In [1]:
import string
from typing import Optional
from dataclasses import dataclass
from collections import defaultdict
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import cached_property
from math import log

from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer

np.random.seed(42)

## Расчет метрик

In [2]:
query_product_interactions = pd.read_parquet("query_product_interactions.parquet")
query_positives = {
    row[1]["search_query"]: set([x[0] for x in row[1]["products"] if x[1] > 5]) 
    for row in query_product_interactions.iterrows()
}
good_enough_query_positives = {
    k: v
    for k, v in query_positives.items()
    if len(v) > 50
}
validation_queries_set = np.random.choice(
    list(good_enough_query_positives.keys()), size=200, replace=False
).tolist()
validation_query_positives_dict = {
    k: v
    for k, v in query_positives.items()
    if k in validation_queries_set
}

In [4]:
query_product_interactions.tail()

Unnamed: 0,search_query,products
263069,яйчный белок,"[[933874938, 1]]"
263070,якрупа,"[[427186572, 1]]"
263071,янбупели,"[[158178266, 1]]"
263072,японские чистящие средства,"[[762090235, 1], [685325392, 1]]"
263073,яшоды смесь,"[[356053164, 1], [1061471790, 2], [684932688, 1]]"


In [5]:
pd.DataFrame(
    {
        "query": list(validation_query_positives_dict.keys()),
        "products": list(validation_query_positives_dict.values()),
    }
).to_parquet("validation_query_positives.parquet")

In [6]:
validation_query_positives = pd.read_parquet("validation_query_positives.parquet")

In [7]:
validation_query_positives_dict = {
    row[1].query: set(row[1].products.tolist()) for row in validation_query_positives.iterrows()
}

In [8]:
validation_query_positives_dict

{'каша молочная детская': {140676397,
  140676406,
  140676407,
  140676408,
  141025733,
  141025734,
  141530480,
  141530481,
  141530483,
  141530484,
  141822580,
  141822582,
  141822583,
  141822587,
  141822591,
  141822592,
  141822595,
  141822598,
  141822600,
  141822601,
  141822602,
  142877133,
  142877134,
  146804896,
  146804898,
  146804905,
  146805007,
  146805009,
  146805010,
  149989767,
  150137879,
  150137884,
  150137885,
  150137887,
  153443095,
  153443096,
  153443097,
  153443099,
  153443100,
  153443101,
  153443102,
  153443103,
  153443106,
  153443107,
  153443108,
  153443109,
  154599318,
  154599321,
  154599323,
  154599325,
  161270710,
  161270711,
  161270712,
  161270713,
  161270714,
  161270715,
  161270716,
  169591356,
  183388924,
  205495294,
  205495295,
  205495296,
  275630677,
  275630688,
  277805186,
  281225223,
  317906273,
  723888643,
  723888763,
  723888768,
  723888876,
  753390931,
  753391211},
 'пюре детское': {1406764

In [9]:
@dataclass
class Metrics:
    precision: float
    recall: float
    f1_score: float
    
    def __repr__(self):
        return f"precision = {self.precision}\nrecall = {self.recall}\nf1_score = {self.f1_score}"


In [10]:
def calculate_metrics(ground_truth_set, search_results_set):
    
    # True positives: items that are both in ground truth and search results
    tp = len(ground_truth_set.intersection(search_results_set))
    
    # Precision: tp / (tp + fp)
    precision = tp / len(search_results_set) if len(search_results_set) > 0 else 0.0
    
    # Recall: tp / (tp + fn)
    recall = tp / len(ground_truth_set) if len(ground_truth_set) > 0 else 0.0
    
    # F1-score: harmonic mean of precision and recall
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    return Metrics(precision=precision, recall=recall, f1_score=f1_score)


In [11]:
def calculate_validation_metrics(search_function, limit=20):
    metrics = []
    for query, positives in validation_query_positives_dict.items():
        metrics.append(
            calculate_metrics(positives, search_function(query=query, limit=limit))
        )
    
    return Metrics(
        precision=np.mean([x.precision for x in metrics]),
        recall=np.mean([x.recall for x in metrics]),
        f1_score=np.mean([x.f1_score for x in metrics]),
    )
    

## Обработка документов

In [12]:
dataset = pd.read_parquet("products_with_names.parquet")

In [13]:
documents_dict = {
    doc[1]["product_id"]: doc[1]["name"] for doc in dataset.iterrows()
}

In [14]:
@dataclass
class Document:
    doc_id: int
    name: str

documents = [Document(doc_id=doc[1]["product_id"], name=doc[1]["name"]) for doc in dataset.iterrows()]


In [15]:
class TextProcessor:
    def __init__(self):
        self.symbols_to_replace = {"ё": "е"}
        self.stopwords = set(stopwords.words("russian"))
        self.linguist = MorphAnalyzer()

    def lowercase_text(self, text: str) -> str:
        return text.lower()

    def replace_symbols(self, text: str) -> str:
        for old, new in self.symbols_to_replace.items():
            text = text.replace(old, new)
        return text

    def process_punctuation_simple(self, text: str) -> str:
        translation_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        text_without_punc = text.translate(translation_table)
        text_without_double_spaces = ' '.join(text_without_punc.split())
        return text_without_double_spaces

    def tokenize_simple(self, text: str) -> list[str]:
        return text.split()

    def remove_stopwords(self, doc: list[str]) -> list[str]:
        return [token for token in doc if token not in self.stopwords]

    def lemmatize_token(self, token: str) -> str:
        return self.linguist.normal_forms(token)[0]
    
    def lemmatize_tokenized_text(self, tokenized_text: list[str]) -> list[str]:
        return [self.lemmatize_token(token) for token in tokenized_text]

    def process_text(self, text: str) -> list[str]:
        text = self.lowercase_text(text)
        text = self.replace_symbols(text)
        text = self.process_punctuation_simple(text)
        text_tokens = self.tokenize_simple(text)
        text_tokens = self.remove_stopwords(text_tokens)
        return self.lemmatize_tokenized_text(text_tokens)
        

In [16]:
text_processor = TextProcessor()

In [17]:
documents_processed = [(document.doc_id, text_processor.process_text(document.name)) for document in tqdm(documents)]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 238443/238443 [02:04<00:00, 1919.85it/s]


In [18]:
def print_ozon_link(product_id: int) -> str:
    print(f"https://ozon.ru/product/{product_id}")

## Абстрактный поисковый движок

In [19]:
from abc import ABC, abstractmethod


class SearchEngine(ABC):
    @abstractmethod
    def index_documents(self, documents_processed: list[tuple[int, list[str]]]) -> None: ...

    @abstractmethod
    def search(self, query: str, limit: Optional[int] = None) -> dict[int, float]: ...


## Поиск по обратному индексу с ранжированием по term frequency

In [20]:
class SearchEngineTf(SearchEngine):
    def __init__(self, text_processor: TextProcessor):
        super().__init__()
        self._index: dict[str, dict[int, int]] = defaultdict(lambda: defaultdict(int))
        self._documents: set[int] = set()
        self._text_processor = text_processor

    def index_documents(self, documents_processed: list[tuple[int, list[str]]]) -> None:
        for doc_id, name in documents_processed:
            self.index_document(doc_id, name)

    def index_document(self, doc_id: int, name: list[str]) -> None:
        self._documents.add(doc_id)
        for token in name:
            self._index[token][doc_id] += 1

    def search(self, query: str, limit: Optional[int] = None) -> dict[int, float]:
        query_tokens = self._text_processor.process_text(query)
        product_scores: dict[int, float] = {}
        for query_token in query_tokens:
            document_scores = self._index[query_token]
            product_scores = self.update_product_scores(product_scores, document_scores)
        return [x[0] for x in sorted(product_scores.items(), key=lambda x: x[1], reverse=True)][:limit]

    def update_product_scores(self, old: dict[int, float], new: dict[int, float]) -> dict[int, float]:
        for doc_id, score in new.items():
            if doc_id in old:
                old[doc_id] += score
            else:
                old[doc_id] = score
        return old


In [21]:
search_engine_tf = SearchEngineTf(text_processor)

In [22]:
search_engine_tf.index_documents(documents_processed)

In [23]:
calculate_validation_metrics(search_engine_tf.search, limit=200)

precision = 0.12972154711551073
recall = 0.27398067131964515
f1_score = 0.16268151296182867

## Поиск по обратному индексу с ранжированием по tf-idf

In [28]:
class SearchEngineTfIdf(SearchEngine):
    def __init__(self, text_processor: TextProcessor):
        self._index: dict[str, dict[int, int]] = defaultdict(lambda: defaultdict(int))
        self._documents: set[int] = set()
        self._text_processor = text_processor

    def index_documents(self, documents_processed: list[tuple[int, list[str]]]) -> None:
        for doc_id, name in documents_processed:
            self.index_document(doc_id, name)

    def index_document(self, doc_id: int, name: list[str]) -> None:
        self._documents.add(doc_id)
        for token in name:
            self._index[token][doc_id] += 1

    @cached_property
    def number_of_documents(self) -> int:
        return len(self._documents)

    def idf(self, query_token: str) -> float:
        N = self.number_of_documents
        n_query_token_docs = len(self._index[query_token])
        return log((N - n_query_token_docs + 0.5) / (n_query_token_docs + 0.5))

    def _get_query_token_document_scores(self, query_token: str) -> dict[int, float]:
        idf = self.idf(query_token)
        return {doc_id: tf * idf for doc_id, tf in self._index[query_token].items()}
    
    def search(self, query: str, limit: Optional[int] = None) -> dict[int, float]:
        query_tokens = self._text_processor.process_text(query)
        product_scores: dict[int, float] = {}
        for query_token in query_tokens:
            document_scores = self._get_query_token_document_scores(query_token)
            product_scores = self.update_product_scores(product_scores, document_scores)
        return [x[0] for x in sorted(product_scores.items(), key=lambda x: x[1], reverse=True)][:limit]
        
    def update_product_scores(self, old: dict[int, float], new: dict[int, float]) -> dict[int, float]:
        for doc_id, score in new.items():
            if doc_id in old:
                old[doc_id] += score
            else:
                old[doc_id] = score
        return old


In [29]:
search_engine_tf_idf = SearchEngineTfIdf(text_processor)

In [30]:
search_engine_tf_idf.index_documents(documents_processed)

In [31]:
calculate_validation_metrics(search_engine_tf_idf.search, limit=200)

precision = 0.12904654711551072
recall = 0.2721263760432715
f1_score = 0.16172968197573453

## Поиск по обратному индексу с ранжированием по bm25

In [32]:
class SearchEngineBm25(SearchEngine):
    def __init__(self, text_processor: TextProcessor, k1: float = 1.2, b: float = 0.75):
        self._index: dict[str, dict[int, int]] = defaultdict(lambda: defaultdict(int))
        self._documents: dict[int, list[str]] = {}
        self._text_processor = text_processor
        self.k1 = k1
        self.b = b

    def index_documents(self, documents_processed: list[tuple[int, list[str]]]) -> None:
        for doc_id, name in documents_processed:
            self.index_document(doc_id, name)

    def index_document(self, doc_id: int, name: list[str]) -> None:
        self._documents[doc_id] = name
        for token in name:
            self._index[token][doc_id] += 1

    @cached_property
    def number_of_documents(self) -> int:
        return len(self._documents)

    @cached_property
    def avdl(self) -> float:
        return sum(len(doc) for doc in self._documents.values()) / self.number_of_documents

    def idf(self, query_token: str) -> float:
        N = self.number_of_documents
        n_query_token_docs = len(self._index[query_token])
        return log((N - n_query_token_docs + 0.5) / (n_query_token_docs + 0.5))

    def _get_query_token_document_scores(self, query_token: str) -> dict[int, float]:
        idf = self.idf(query_token)
        return {
            doc_id: (self.k1 + 1) * tf / (tf + self.k1 * (1 - self.b + self.b * len(self._documents[doc_id]) / self.avdl)) * idf
            for doc_id, tf in self._index[query_token].items()
        }
    
    def search(self, query: str, limit: Optional[int] = None) -> dict[int, float]:
        query_tokens = self._text_processor.process_text(query)
        product_scores: dict[int, float] = {}
        for query_token in query_tokens:
            document_scores = self._get_query_token_document_scores(query_token)
            product_scores = self.update_product_scores(product_scores, document_scores)
        return [x[0] for x in sorted(product_scores.items(), key=lambda x: x[1], reverse=True)][:limit]
        
    def update_product_scores(self, old: dict[int, float], new: dict[int, float]) -> dict[int, float]:
        for doc_id, score in new.items():
            if doc_id in old:
                old[doc_id] += score
            else:
                old[doc_id] = score
        return old


In [45]:
search_engine_bm25 = SearchEngineBm25(text_processor, k1=1.2, b=0.75)

In [46]:
search_engine_bm25.index_documents(documents_processed)

In [47]:
calculate_validation_metrics(search_engine_bm25.search, limit=200)

precision = 0.15347154711551073
recall = 0.3330483061933653
f1_score = 0.19553645047647358

## Поиск по обратному индексу на n-gram-ах с ранжированием по BM25 

In [63]:
from collections import defaultdict
from typing import List, Tuple, Optional, Dict, Set
import itertools


class SearchEngineNgramBm25(SearchEngine):
    def __init__(self, text_processor: TextProcessor, k1: float = 1.2, b: float = 0.75, n_gram_size: int = 2):
        super().__init__()
        self._index: Dict[str, Dict[int, int]] = defaultdict(lambda: defaultdict(int))
        self._documents: dict[int, list[str]] = {}
        self._text_processor = text_processor
        self.k1 = k1
        self.b = b
        self._n_gram_size = n_gram_size

    def index_documents(self, documents_processed: list[Document]) -> None:
        for doc in documents_processed:
            self.index_document(doc.doc_id, doc.name)

    def index_document(self, doc_id: int, name: str) -> None:
        doc_name_processed = self._text_processor.process_text(name)
        ngrams = self._generate_ngrams(doc_name_processed)
        self._documents[doc_id] = ngrams
        for ngram in ngrams:
            self._index[ngram][doc_id] += 1

    def _generate_ngrams(self, name: str) -> list[str]:
        ngrams = []
        for i in range(len(name) - self._n_gram_size + 1):
            ngrams.append(name[i : i + self._n_gram_size])
        return ngrams
    
    @cached_property
    def number_of_documents(self) -> int:
        return len(self._documents)

    @cached_property
    def avdl(self) -> float:
        return sum(len(doc) for doc in self._documents.values()) / self.number_of_documents

    def idf(self, query_token: str) -> float:
        N = self.number_of_documents
        n_query_token_docs = len(self._index[query_token])
        return log((N - n_query_token_docs + 0.5) / (n_query_token_docs + 0.5) + 1)

    def _get_query_token_document_scores(self, query_token: str) -> dict[int, float]:
        idf = self.idf(query_token)
        return {
            doc_id: (self.k1 + 1) * tf / (tf + self.k1 * (1 - self.b + self.b * len(self._documents[doc_id]) / self.avdl)) * idf
            for doc_id, tf in self._index[query_token].items()
        }
    
    def search(self, query: str, limit: Optional[int] = None) -> dict[int, float]:
        query_processed = self._text_processor.process_text(query)
        query_ngrams = self._generate_ngrams(query_processed)
        product_scores: dict[int, float] = {}
        for query_ngram in query_ngrams:
            document_scores = self._get_query_token_document_scores(query_ngram)
            product_scores = self.update_product_scores(product_scores, document_scores)
        return [x[0] for x in sorted(product_scores.items(), key=lambda x: x[1], reverse=True)][:limit]
        
    def update_product_scores(self, old: dict[int, float], new: dict[int, float]) -> dict[int, float]:
        for doc_id, score in new.items():
            if doc_id in old:
                old[doc_id] += score
            else:
                old[doc_id] = score
        return old


In [64]:
class SimpleTextProcessor(TextProcessor):
    def process_text(self, text: str) -> str:
        text = self.lowercase_text(text)
        text = self.replace_symbols(text)
        return self.process_punctuation_simple(text)

simple_text_processor = SimpleTextProcessor()

In [76]:
search_engine_ngram_bm25 = SearchEngineNgramBm25(simple_text_processor, k1=1.2, b=0.75, n_gram_size=2)

In [77]:
search_engine_ngram_bm25.index_documents(documents)

In [67]:
[print_ozon_link(x) for x in search_engine_ngram_bm25.search("кассеты для бритвы мужские", limit=30)]

https://ozon.ru/product/465164863
https://ozon.ru/product/835428859
https://ozon.ru/product/482126805
https://ozon.ru/product/591928165
https://ozon.ru/product/680978604
https://ozon.ru/product/620047873
https://ozon.ru/product/620046931
https://ozon.ru/product/746671114
https://ozon.ru/product/168165795
https://ozon.ru/product/630691239
https://ozon.ru/product/217068283
https://ozon.ru/product/161674518
https://ozon.ru/product/517209910
https://ozon.ru/product/168894035
https://ozon.ru/product/267981985
https://ozon.ru/product/184573224
https://ozon.ru/product/640910116
https://ozon.ru/product/164214504
https://ozon.ru/product/4788808
https://ozon.ru/product/144505497
https://ozon.ru/product/669972122
https://ozon.ru/product/669985694
https://ozon.ru/product/669970989
https://ozon.ru/product/670058352
https://ozon.ru/product/670058897
https://ozon.ru/product/670058654
https://ozon.ru/product/161674517
https://ozon.ru/product/4788809
https://ozon.ru/product/144505500
https://ozon.ru/pr

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [68]:
[print_ozon_link(x) for x in search_engine_bm25.search("кассеты для бритвы мужские", limit=30)]

https://ozon.ru/product/482126805
https://ozon.ru/product/626512687
https://ozon.ru/product/356922446
https://ozon.ru/product/465164863
https://ozon.ru/product/835428859
https://ozon.ru/product/517209910
https://ozon.ru/product/217068283
https://ozon.ru/product/168165795
https://ozon.ru/product/231812359
https://ozon.ru/product/640910116
https://ozon.ru/product/184573224
https://ozon.ru/product/580637093
https://ozon.ru/product/216665672
https://ozon.ru/product/746671114
https://ozon.ru/product/164214504
https://ozon.ru/product/4788808
https://ozon.ru/product/620047873
https://ozon.ru/product/620046931
https://ozon.ru/product/157424893
https://ozon.ru/product/5543624
https://ozon.ru/product/138423967
https://ozon.ru/product/239779174
https://ozon.ru/product/4788809
https://ozon.ru/product/4789070
https://ozon.ru/product/1462804333
https://ozon.ru/product/162865256
https://ozon.ru/product/1352518845
https://ozon.ru/product/32858184
https://ozon.ru/product/164214452
https://ozon.ru/produ

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [69]:
[print_ozon_link(x) for x in search_engine_ngram_bm25.search("косеты для бритвы мужские", limit=30)]


https://ozon.ru/product/465164863
https://ozon.ru/product/835428859
https://ozon.ru/product/482126805
https://ozon.ru/product/620047873
https://ozon.ru/product/620046931
https://ozon.ru/product/680978604
https://ozon.ru/product/591928165
https://ozon.ru/product/746671114
https://ozon.ru/product/168165795
https://ozon.ru/product/217068283
https://ozon.ru/product/630691239
https://ozon.ru/product/669972122
https://ozon.ru/product/669985694
https://ozon.ru/product/669970989
https://ozon.ru/product/670058352
https://ozon.ru/product/267981985
https://ozon.ru/product/184573224
https://ozon.ru/product/670058897
https://ozon.ru/product/670058654
https://ozon.ru/product/640910116
https://ozon.ru/product/161674518
https://ozon.ru/product/517209910
https://ozon.ru/product/4788809
https://ozon.ru/product/144505500
https://ozon.ru/product/4788804
https://ozon.ru/product/164214502
https://ozon.ru/product/162865257
https://ozon.ru/product/4788805
https://ozon.ru/product/162865256
https://ozon.ru/prod

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [70]:
[print_ozon_link(x) for x in search_engine_bm25.search("косеты для бритвы мужские", limit=30)]


https://ozon.ru/product/356922446
https://ozon.ru/product/1387337779
https://ozon.ru/product/648028268
https://ozon.ru/product/1340978156
https://ozon.ru/product/382216383
https://ozon.ru/product/928514787
https://ozon.ru/product/1528317517
https://ozon.ru/product/1496118414
https://ozon.ru/product/1572952364
https://ozon.ru/product/952023288
https://ozon.ru/product/1192905067
https://ozon.ru/product/1562255075
https://ozon.ru/product/482126805
https://ozon.ru/product/420666685
https://ozon.ru/product/626512687
https://ozon.ru/product/422743978
https://ozon.ru/product/790115501
https://ozon.ru/product/208009731
https://ozon.ru/product/714380855
https://ozon.ru/product/1416463253
https://ozon.ru/product/1048144951
https://ozon.ru/product/465164863
https://ozon.ru/product/847166922
https://ozon.ru/product/1344582380
https://ozon.ru/product/835428859
https://ozon.ru/product/683899493
https://ozon.ru/product/208009732
https://ozon.ru/product/1144916855
https://ozon.ru/product/226988572
htt

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [71]:
[print_ozon_link(x) for x in search_engine_bm25.search("халодильник", limit=30)]


[]

In [72]:
[print_ozon_link(x) for x in search_engine_ngram_bm25.search("халодильник", limit=30)]


https://ozon.ru/product/209997102
https://ozon.ru/product/1576425147
https://ozon.ru/product/1457779141
https://ozon.ru/product/1444369749
https://ozon.ru/product/193470153
https://ozon.ru/product/170075383
https://ozon.ru/product/193470154
https://ozon.ru/product/182451989
https://ozon.ru/product/182451991
https://ozon.ru/product/154475460
https://ozon.ru/product/266791724
https://ozon.ru/product/1075792425
https://ozon.ru/product/170075379
https://ozon.ru/product/914168984
https://ozon.ru/product/154072681
https://ozon.ru/product/266791176
https://ozon.ru/product/914170197
https://ozon.ru/product/1074359967
https://ozon.ru/product/170075380
https://ozon.ru/product/170075385
https://ozon.ru/product/170075384
https://ozon.ru/product/170075378
https://ozon.ru/product/154474648
https://ozon.ru/product/170075382
https://ozon.ru/product/154073300
https://ozon.ru/product/170075381
https://ozon.ru/product/914170194
https://ozon.ru/product/415182919
https://ozon.ru/product/415173072
https://o

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [73]:
[print_ozon_link(x) for x in search_engine_bm25.search("новушники", limit=30)]


[]

In [74]:
[print_ozon_link(x) for x in search_engine_ngram_bm25.search("новушники", limit=30)]


https://ozon.ru/product/220484272
https://ozon.ru/product/544315175
https://ozon.ru/product/202774783
https://ozon.ru/product/1549417381
https://ozon.ru/product/645008371
https://ozon.ru/product/505543662
https://ozon.ru/product/793717993
https://ozon.ru/product/254052997
https://ozon.ru/product/254070112
https://ozon.ru/product/294968666
https://ozon.ru/product/294969778
https://ozon.ru/product/254051714
https://ozon.ru/product/190759201
https://ozon.ru/product/294968551
https://ozon.ru/product/294969794
https://ozon.ru/product/190759208
https://ozon.ru/product/779662322
https://ozon.ru/product/407158738
https://ozon.ru/product/589737338
https://ozon.ru/product/1540354338
https://ozon.ru/product/698850642
https://ozon.ru/product/465680921
https://ozon.ru/product/1380001456
https://ozon.ru/product/273220354
https://ozon.ru/product/1643701055
https://ozon.ru/product/589776489
https://ozon.ru/product/964575571
https://ozon.ru/product/310586499
https://ozon.ru/product/701052713
https://oz

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [78]:
calculate_validation_metrics(search_engine_ngram_bm25.search, limit=200)

precision = 0.142125
recall = 0.3460049592552592
f1_score = 0.1933477681308207