In [1]:
# required libraries when working in Google Colab
!pip install datasets
!pip install rank_bm25
!pip install pymorphy3
!pip install pymorphy3-dicts-ru
!pip install kaleido



In [2]:
# save files to Google Drive
import os
import sys
from google.colab import drive
drive.mount('/content/drive')

# required: folder with project files path
files_path = '/content/drive/MyDrive/diploma' # your path

sys.path.append(files_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Retrievers realization module
from Retrievers import TransformerRetriever, LexicalRetriever, HybridScorer

# quality of models: recall calculation and grid search by weights in hybrid method
from model_quality import make_recall_function, grid_search_weights

# grid search results plotting
from plotting import plot_3d_recall

# methods that make it easier to compare models
from pipline_functions import ISL, calculate_load_grid_pipline, stat_comparasion

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data loading

In [4]:
from datasets import load_dataset

ds = load_dataset("deepvk/ru-HNP") # data loading
data = ds["test"] # in this paper we use test part of the dataset

queries = [row["query"] for row in data]
positives = [row["pos"] for row in data] # positive examples for a query
negatives = [row["neg"] for row in data] # negative examples for a query

corpus = [] # single corpus of all positives and negatives
corpus_id_map = []  # query prototype index -> pos / neg

for idx, row in enumerate(data):
    for pos in row["pos"]:
        corpus.append(pos)
        corpus_id_map.append((idx, "pos"))
    for neg in row["neg"]:
        corpus.append(neg)
        corpus_id_map.append((idx, "neg"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
example_num = -3

print('Query example:')
print(queries[example_num])

print('\nPositive results for the query example:')
for item in positives[example_num]:
  print(item)

Query example:
Пермь II (до 1909 года - 'Заимки', до 1911 года - 'Пермь-Заимки') - железнодорожная станция Пермского региона Свердловской железной дороги, находящаяся в городе Перми, административном центре Пермского края.

Positive results for the query example:
Железнодорожная станция Пермь II (ранее известная как 'Заимки' до 1909 года и 'Пермь-Заимки' до 1911 года) расположена в городе Перми, который является административным центром Пермского края.
В городе Перми находится железнодорожная станция Пермь II, изначально названная 'Заимки' до 1909 года, а затем 'Пермь-Заимки' до 1911 года.
Пермь II - это название железнодорожной станции в городе Перми, которая ранее была известна как 'Заимки' до 1909 года и 'Пермь-Заимки' до 1911 года.
Железнодорожная станция Пермь II расположена в Перми, административном центре Пермского края, и ранее носила название 'Заимки' до 1909 года и 'Пермь-Заимки' до 1911 года.
В Перми находится железнодорожная станция Пермь II, которая была известна как 'Заим

### Lexical and Transformer Models
Lexical models:
* BM25
* TF-IDF

Transformers (adapted to Russian or multilingual):
*   [DeepPavlov/rubert-base-cased](https://huggingface.co/DeepPavlov/rubert-base-cased)
*   [deepvk/RuModernBERT-base](https://huggingface.co/deepvk/RuModernBERT-base)
*   [cointegrated/rubert-tiny2](https://huggingface.co/cointegrated/rubert-tiny2)
*   [sergeyzh/rubert-tiny-turbo](https://huggingface.co/sergeyzh/rubert-tiny-turbo)
*   [google-bert/bert-base-multilingual-cased](https://huggingface.co/google-bert/bert-base-multilingual-cased)
*   [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2)
*   [ai-forever/sbert_large_nlu_ru](https://huggingface.co/ai-forever/sbert_large_nlu_ru)


In [6]:
model_names = [
    #BERT
    'DeepPavlov/rubert-base-cased'
    ,'deepvk/RuModernBERT-base'
    ,'cointegrated/rubert-tiny2'
    ,'sergeyzh/rubert-tiny-turbo'
    ,'google-bert/bert-base-multilingual-cased'
    #SBERT
    ,'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    ,'ai-forever/sbert_large_nlu_ru'
  ]

# we create a dict in which there is a dict
# with the necessary parameters of the model for research

meta_dict = {
    model_name: {} for model_name in model_names
}


In [7]:
# serialize or load embeddings for lexical methods
lexical = LexicalRetriever(use_bm25=True, use_tfidf=True)
lexical._load_embeddings(file_path = files_path)

print("\nCaching queries for Lexical Retrievers...")
lexical.cache_query_scores(queries)

for model_name in model_names:
  meta_dict[model_name]['model'] = ISL(model_name, files_path = files_path,
                                       corpus = corpus, queries = queries,
                                       serialize_flg = False, load_flg = True)



Caching queries for Lexical Retrievers...


Lexical scoring:   0%|          | 0/2000 [00:00<?, ?it/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading corpus embeddings for DeepPavlov/rubert-base-cased...
DeepPavlov/rubert-base-cased embeddings loaded

Loading corpus embeddings for deepvk/RuModernBERT-base...
deepvk/RuModernBERT-base embeddings loaded

Loading corpus embeddings for cointegrated/rubert-tiny2...
cointegrated/rubert-tiny2 embeddings loaded

Loading corpus embeddings for sergeyzh/rubert-tiny-turbo...
sergeyzh/rubert-tiny-turbo embeddings loaded

Loading corpus embeddings for google-bert/bert-base-multilingual-cased...
google-bert/bert-base-multilingual-cased embeddings loaded

Loading corpus embeddings for sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2...
sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 embeddings loaded

Loading corpus embeddings for ai-forever/sbert_large_nlu_ru...
ai-forever/sbert_large_nlu_ru embeddings loaded



In [8]:
# recall function for each model
for model_name in model_names:
  meta_dict[model_name]['recall function'] = make_recall_function(
      transformer = meta_dict[model_name]['model'],
      transformer_embeds = meta_dict[model_name]['model'].get_corpus_embeddings(),
      queries = queries,
      corpus_id_map = corpus_id_map,
      lexical_retriever = lexical,
      k=5
    )


Precomputing scores for DeepPavlov/rubert-base-cased...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for deepvk/RuModernBERT-base...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for cointegrated/rubert-tiny2...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for sergeyzh/rubert-tiny-turbo...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for google-bert/bert-base-multilingual-cased...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

Precomputing scores for ai-forever/sbert_large_nlu_ru...


Precomputing:   0%|          | 0/2000 [00:00<?, ?it/s]

In [10]:
# calculating or loading grid search results into a dict
grid = .02
n_jobs = -1

for model_name in model_names:
  meta_dict[model_name]['grid search results'] = calculate_load_grid_pipline(
          model_name = model_name, files_path = files_path,
          grid = grid, n_jobs = n_jobs,
          calculate_flg = False, load_flg = True
      )


In [11]:
# statistical significance of metric gains with better weights
for model_name in model_names:
  meta_dict[model_name]['statistical comparasion'] = stat_comparasion(
          result_df = meta_dict[model_name]['grid search results'],
          recall_func = meta_dict[model_name]['recall function'],
          alpha = 0, beta = 0, gamma = 0
        )


In [13]:
import pandas as pd

# table with results

alpha_significance_level = .01 # for comparasion Recall@5: best weights vs baseline

results_df = pd.DataFrame(
    data = {
        'model name': [
            model_name.split('/')[1] for model_name in model_names
        ]
        ,'best alpha (BM25)': [
            meta_dict[model_name]['grid search results'].sort_values(by = 'recall@5', ascending = False).iloc[0]['alpha (BM25)']
            for model_name in meta_dict
        ]
        ,'best beta (TF-IDF)': [
            meta_dict[model_name]['grid search results'].sort_values(by = 'recall@5', ascending = False).iloc[0]['beta (TF-IDF)']
            for model_name in meta_dict
        ]
        ,'best gamma (Transformer)': [
            meta_dict[model_name]['grid search results'].sort_values(by = 'recall@5', ascending = False).iloc[0]['gamma (Transformer)']
            for model_name in meta_dict
        ]
        ,'best recall@5': [
            meta_dict[model_name]['grid search results'].sort_values(by = 'recall@5', ascending = False).iloc[0]['recall@5']
            for model_name in meta_dict
        ]
        ,'baseline recall@5': [
            meta_dict[model_name]['grid search results'][
                meta_dict[model_name]['grid search results']['gamma (Transformer)'] == 1 #transformer only
            ]['recall@5'].item()
            for model_name in meta_dict
        ]
        ,'delta': [
            meta_dict[model_name]['grid search results'].sort_values(by = 'recall@5', ascending = False).iloc[0]['recall@5'] -
            meta_dict[model_name]['grid search results'][
                meta_dict[model_name]['grid search results']['gamma (Transformer)'] == 1 # vs transformer only
            ]['recall@5'].item()
            for model_name in meta_dict
        ]
        ,'p-value (U-test)': [
            1 if isinstance(meta_dict[model_name]['statistical comparasion'], str)
            else meta_dict[model_name]['statistical comparasion'].pvalue
            for model_name in meta_dict
        ]
    }
)

results_df['is significant'] = results_df['p-value (U-test)'].apply(
    lambda x: 'yes' if x < alpha_significance_level else 'no'
  )

results_df.index = results_df.index + 1

results_df.to_csv(files_path + '/experiments_results/' + 'stat_test_results.csv', index = None)

results_df


Unnamed: 0,model name,best alpha (BM25),best beta (TF-IDF),best gamma (Transformer),best recall@5,baseline recall@5,delta,p-value (U-test),is significant
1,rubert-base-cased,0.0,0.24,0.76,0.7399,0.7208,0.0191,1.112593e-06,yes
2,RuModernBERT-base,0.02,0.04,0.94,0.7266,0.7161,0.0105,0.0002905122,yes
3,rubert-tiny2,0.0,0.06,0.94,0.7741,0.7719,0.0022,0.2850719,no
4,rubert-tiny-turbo,0.0,0.0,1.0,0.9094,0.9094,0.0,1.0,no
5,bert-base-multilingual-cased,0.08,0.18,0.74,0.7305,0.7035,0.027,3.659564e-08,yes
6,paraphrase-multilingual-MiniLM-L12-v2,0.0,0.0,1.0,0.9245,0.9245,0.0,1.0,no
7,sbert_large_nlu_ru,0.16,0.0,0.84,0.7534,0.7115,0.0419,1.2479139999999999e-26,yes


In [14]:
# Recall@5 for baseline: lexical only or transformer only

grid_search_df = pd.concat(
    [meta_dict[model_name]['grid search results'] for model_name in model_names]
  )

table_1_df = grid_search_df[
    ((grid_search_df['model'] == 'DeepPavlov/rubert-base-cased') & (grid_search_df['alpha (BM25)'] == 1))
    | ((grid_search_df['model'] == 'DeepPavlov/rubert-base-cased') & (grid_search_df['beta (TF-IDF)'] == 1))
    | (grid_search_df['gamma (Transformer)'] == 1)
]

def determine_model_name(row):
    if row['alpha (BM25)'] == 1:
        return 'BM25'
    elif row['beta (TF-IDF)'] == 1:
        return 'TF-IDF'
    elif row['gamma (Transformer)'] == 1:
        return row['model']
    else:
        return None

table_1_df['model_name'] = table_1_df.apply(determine_model_name, axis=1)

print('Recall@5 for lexical and transformer methods.')

table_1_df[['model_name', 'recall@5']].sort_values(by = 'recall@5')


Recall@5 for lexical and transformer methods.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table_1_df['model_name'] = table_1_df.apply(determine_model_name, axis=1)


Unnamed: 0,model_name,recall@5
0,google_bert,0.7035
0,sbert_nlu_ru,0.7115
0,ru_modern_bert_base,0.7161
0,bert,0.7208
0,rubert_tiny2,0.7719
0,rubert_tiny_turbo,0.9094
0,sbert,0.9245


In [15]:
# Recall@5 comparasion with Wilcoxon test: lexical vs transformer

table_2_df = pd.DataFrame(data = {'model':[], 'BM25':[], 'TF-IDF':[]})


for model_name in model_names:
  bm25_p_value = stat_comparasion(
      result_df = meta_dict[model_name]['grid search results'],
      recall_func = meta_dict[model_name]['recall function'],
      alpha = 1, beta = 0, gamma = 0 # comparison with BM25
  )[1] #p-value

  tf_idf_p_value = stat_comparasion(
      result_df = meta_dict[model_name]['grid search results'],
      recall_func = meta_dict[model_name]['recall function'],
      alpha = 0, beta = 1, gamma = 0 # comparison with BM25
  )[1] #p-value

  table_2_df = pd.concat([
      table_2_df,
      pd.DataFrame(data = {'model':[model_name], 'BM25':[bm25_p_value], 'TF-IDF':[tf_idf_p_value]})
    ])

print('Statistical significance of metric growth in transformer methods.')
print('At the intersection of the row and column is the p-value.')

table_2_df


Statistical significance of metric growth in transformer methods.
At the intersection of the row and column is the p-value.


Unnamed: 0,model,BM25,TF-IDF
0,DeepPavlov/rubert-base-cased,2.556518e-09,3.367369e-11
0,deepvk/RuModernBERT-base,2.094387e-07,4.277356e-09
0,cointegrated/rubert-tiny2,1.827053e-51,8.622898e-56
0,sergeyzh/rubert-tiny-turbo,2.04913e-179,7.3715449999999995e-183
0,google-bert/bert-base-multilingual-cased,0.002065367,0.0001847387
0,sentence-transformers/paraphrase-multilingual-...,5.756545e-197,3.055455e-200
0,ai-forever/sbert_large_nlu_ru,8.459737e-05,1.03166e-05


In [16]:
# Recall@5 with best weights

table_3_df = results_df[results_df.columns[0:-2]]

print('Grid search weights results.')

table_3_df


Grid search weights results.


Unnamed: 0,model name,best alpha (BM25),best beta (TF-IDF),best gamma (Transformer),best recall@5,baseline recall@5,delta
1,rubert-base-cased,0.0,0.24,0.76,0.7399,0.7208,0.0191
2,RuModernBERT-base,0.02,0.04,0.94,0.7266,0.7161,0.0105
3,rubert-tiny2,0.0,0.06,0.94,0.7741,0.7719,0.0022
4,rubert-tiny-turbo,0.0,0.0,1.0,0.9094,0.9094,0.0
5,bert-base-multilingual-cased,0.08,0.18,0.74,0.7305,0.7035,0.027
6,paraphrase-multilingual-MiniLM-L12-v2,0.0,0.0,1.0,0.9245,0.9245,0.0
7,sbert_large_nlu_ru,0.16,0.0,0.84,0.7534,0.7115,0.0419


In [17]:
# significance of gain Recall@5 with best weights
table_4_df = results_df[
    ['model name', 'best gamma (Transformer)', 'best recall@5', 'baseline recall@5', 'delta', 'p-value (U-test)']
  ]

print('Significance of gain Recall@5 with best weights.')

table_4_df


Significance of gain Recall@5 with best weights.


Unnamed: 0,model name,best gamma (Transformer),best recall@5,baseline recall@5,delta,p-value (U-test)
1,rubert-base-cased,0.76,0.7399,0.7208,0.0191,1.112593e-06
2,RuModernBERT-base,0.94,0.7266,0.7161,0.0105,0.0002905122
3,rubert-tiny2,0.94,0.7741,0.7719,0.0022,0.2850719
4,rubert-tiny-turbo,1.0,0.9094,0.9094,0.0,1.0
5,bert-base-multilingual-cased,0.74,0.7305,0.7035,0.027,3.659564e-08
6,paraphrase-multilingual-MiniLM-L12-v2,1.0,0.9245,0.9245,0.0,1.0
7,sbert_large_nlu_ru,0.84,0.7534,0.7115,0.0419,1.2479139999999999e-26


In [19]:
# quality surfaces for alpha, beta, gamma
for model_name in model_names:
  meta_dict[model_name]['grid search results']['model'] = model_name

surface_results_df = pd.concat(
    [meta_dict[model_name]['grid search results'] for model_name in model_names]
  )

plot_3d_recall(surface_results_df, file_path = files_path)
