In [1]:
import sys
sys.path.insert(0, "/home/dzigen/Desktop/ITMO/ВКР/КМУ2024/")

from src.retrievers.e5 import E5Retriever
from src.retrievers.bm25e5 import BM25E5Retriever
from src.retrievers.bm25colbert import BM25ColBertRetriever

import pandas as pd
from tqdm import tqdm

In [2]:
# Directory that holds the SQuAD CSV splits (absolute, machine-specific path).
SQUAD_DATASET_DIR = '/home/dzigen/Desktop/ITMO/ВКР/КМУ2024/data/SQuAD'
# NOTE: "SQAUD" is a misspelling of "SQUAD"; the names are kept unchanged
# because other cells of this notebook reference them.
SQAUD_TRAIN_FILE = f'{SQUAD_DATASET_DIR}/train.csv'
SQAUD_VAL_FILE = f'{SQUAD_DATASET_DIR}/validation.csv'

def prepare_squad(train_file=None, val_file=None):
    """Load the SQuAD CSV splits and return their unique passages.

    The train and validation CSVs (``;``-separated) are concatenated and
    de-duplicated on the ``in_base_index`` column, keeping the first row seen
    for each index value.

    Args:
        train_file: Path to the train CSV. Defaults to ``SQAUD_TRAIN_FILE``.
        val_file: Path to the validation CSV. Defaults to ``SQAUD_VAL_FILE``.

    Returns:
        A ``(texts, metadata)`` tuple of equal-length lists: ``texts`` holds the
        ``context`` value of every unique passage and ``metadata`` holds the
        matching ``{'in_base_index': ...}`` dict.
    """
    # Resolve the defaults at call time so the module-level constants are only
    # needed when the caller does not pass explicit paths.
    if train_file is None:
        train_file = SQAUD_TRAIN_FILE
    if val_file is None:
        val_file = SQAUD_VAL_FILE

    squad_train_df = pd.read_csv(train_file, sep=';')
    squad_val_df = pd.read_csv(val_file, sep=';')
    union_df = (
        pd.concat([squad_train_df, squad_val_df], ignore_index=True)
        .drop_duplicates(subset=['in_base_index'])
        .reset_index(drop=True)
    )

    texts = union_df['context'].to_list()
    # Iterate the underlying array instead of indexing the Series row by row;
    # the scalar type stored in each dict stays the same as with series[i].
    metadata = [
        {'in_base_index': base_idx}
        for base_idx in union_df['in_base_index'].to_numpy()
    ]

    return texts, metadata

#### Нумеруем уникальные пассажи в SQuAD-датасете

In [None]:
# Load both SQuAD splits (train first, then validation) from ';'-separated CSVs.
squad_train_df, squad_val_df = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (SQAUD_TRAIN_FILE, SQAUD_VAL_FILE)
)

In [None]:
# Collect the distinct passages of both splits in first-appearance order.
# A set() of strings iterates in an order that depends on per-process hash
# randomization, so positions in this list (later stored as in_base_index)
# would change between kernel sessions; drop_duplicates() keeps them stable.
unique_cntxs = (
    pd.concat(
        [squad_train_df['context'], squad_val_df['context']],
        ignore_index=True,
    )
    .drop_duplicates()
    .to_list()
)

In [None]:
# Positional ids 0 .. len(unique_cntxs) - 1, one per unique context.
cntxs_idxs = [position for position, _ in enumerate(unique_cntxs)]

In [None]:
# Map every context string to its position in unique_cntxs once, so each row
# needs a dict lookup instead of a linear unique_cntxs.index() scan.
# setdefault keeps the first position of a value, matching list.index().
cntx_to_idx = {}
for cntx_idx, cntx in enumerate(unique_cntxs):
    cntx_to_idx.setdefault(cntx, cntx_idx)

for part_df in [squad_train_df, squad_val_df]:
    # Store the passage position as a new column of the split; a context that
    # is missing from unique_cntxs raises KeyError instead of being skipped.
    part_df['in_base_index'] = [
        cntx_to_idx[cur_cntx] for cur_cntx in tqdm(part_df['context'])
    ]

In [None]:
# Write both splits back to the same CSV paths they were read from, so the
# files on disk are overwritten and now include the 'in_base_index' column.
squad_train_df.to_csv(SQAUD_TRAIN_FILE, sep=';', index=False)
squad_val_df.to_csv(SQAUD_VAL_FILE, sep=';', index=False)

# For Joint Tuning

#### E5 base (squad)

In [3]:
# Relative path passed to make_base / load_base for the E5 base over SQuAD.
E5FROZEN_BASE = '../data/bases/e5_squad_base'

In [4]:
# Create the E5 retriever on the GPU; the recorded output shows that
# construction logs "Loading E5-model...".
retriever = E5Retriever(device='cuda')
# Unique SQuAD passages and their {'in_base_index': ...} metadata.
texts, metadata = prepare_squad()

Loading E5-model...


In [None]:
# Presumably builds the E5 base from the passages and stores it at
# E5FROZEN_BASE — confirm in E5Retriever. This cell has no execution count in
# the saved notebook, i.e. it was not run in the recorded session.
retriever.make_base(texts, metadata, E5FROZEN_BASE)

In [5]:
# Presumably loads the previously built base from E5FROZEN_BASE — confirm in
# E5Retriever.
retriever.load_base(E5FROZEN_BASE)

In [7]:
# Smoke-test the loaded base with one query. Per the recorded output, search
# logs an E5 retrieval step followed by a threshold-filtering step.
result = retriever.search("query: Who is beyonce?")

Retrieving documents with E5...
Filtering irrelevant document by threshold...


In [8]:
# NOTE(review): in the recorded output the value printed under "scores:" is an
# array of 'passage: ...' strings, so result[0] appears to hold passages rather
# than scores — confirm the return order of retriever.search and fix the labels
# if they are swapped.
print("scores:")
print(result[0])
print("passages: ")
print(result[1])

scores:
['passage: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'
 'passage: Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé\'s name is a tribute to her mother\'s maiden name. Beyoncé\'s younger sister Solange is al

#### BM25+ColBERT base (squad)

In [3]:
# Relative path passed to make_bm25_base / load_bm25_base for the BM25+ColBERT
# retriever over SQuAD.
BM25COLBERT_BASE = '../data/bases/bm25colbert_squad_base'

In [4]:
# Create the BM25+ColBERT retriever; the recorded output shows that
# construction logs "Loading base ColBERT-model..." and emits a warning that
# the RoBERTa pooler weights are newly initialized.
retriever = BM25ColBertRetriever()
# Unique SQuAD passages and their {'in_base_index': ...} metadata.
texts, metadata = prepare_squad()

Loading base ColBERT-model...


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Presumably builds the BM25 base from the passages and stores it at
# BM25COLBERT_BASE — confirm in BM25ColBertRetriever. This cell has no
# execution count in the saved notebook, i.e. it was not run in the recorded
# session.
retriever.make_bm25_base(texts, metadata, BM25COLBERT_BASE)

In [5]:
# Load the BM25 base from BM25COLBERT_BASE; the recorded output logs
# "Loading precomputed base...".
retriever.load_bm25_base(BM25COLBERT_BASE)

Loading precomputed base...


In [6]:
# Smoke-test with one query. Per the recorded output, search logs a BM25
# retrieval step, a ColBERT re-ranking step (63 progress-bar iterations, about
# 1:51 in the recorded run) and a threshold-filtering step.
result = retriever.search("query: Who is beyonce?")

Retrieving documents with BM25...
Re-ranking documents with ColBERT...


100%|██████████| 63/63 [01:51<00:00,  1.77s/it]

Filtering irrelevant document by threshold...





In [7]:
# NOTE(review): in the recorded output the value printed under "scores: " is an
# array of 'passage: ...' strings, so result[0] appears to hold passages rather
# than scores — confirm the return order of retriever.search and fix the labels
# if they are swapped.
print("scores: ")
print(result[0])
print("passages: ")
print(result[1])

scores: 
['passage: Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé\'s name is a tribute to her mother\'s maiden name. Beyoncé\'s younger sister Solange is also a singer and a former member of Destiny\'s Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.'
 'passage: On the northern outskirts of the city, Crownhill Fort is a well restored example of a "Palmerston\'s Folly". It is owned by the Landmark Trust and is open to the public.'
 "passage: New Haven was the subject of Who Governs? Democracy and Power in An American City, a very influential book in political science by preeminent Yale professor Robert A. Dahl, which incl

#### BM25+E5 base (squad)

In [3]:
# Relative path passed to make_bm25_base / load_bm25_base for the BM25+E5
# retriever over SQuAD.
BM25E5_BASE = '../data/bases/bm25E5_squad_base'

In [4]:
# Create the BM25+E5 retriever; the recorded output shows that construction
# logs "Loading base E5-model...".
retriever = BM25E5Retriever()
# Unique SQuAD passages and their {'in_base_index': ...} metadata.
texts, metadata = prepare_squad()

Loading base E5-model...


In [None]:
# Presumably builds the BM25 base from the passages and stores it at
# BM25E5_BASE — confirm in BM25E5Retriever. This cell has no execution count in
# the saved notebook, i.e. it was not run in the recorded session.
retriever.make_bm25_base(texts, metadata, BM25E5_BASE)

In [5]:
# Load the BM25 base from BM25E5_BASE; the recorded output logs
# "Loading precomputed base...".
retriever.load_bm25_base(BM25E5_BASE)

Loading precomputed base...


In [6]:
# Smoke-test with one query. Per the recorded output, search logs a BM25
# retrieval step, an E5 re-ranking step (63 progress-bar iterations, about
# 1:58 in the recorded run) and a threshold-filtering step.
result = retriever.search("query: Who is beyonce?")

Retrieving documents with BM25...
Re-ranking documents with E5...


  0%|          | 0/63 [00:00<?, ?it/s]

100%|██████████| 63/63 [01:58<00:00,  1.88s/it]

Filtering irrelevant document by threshold...





In [7]:
# NOTE(review): in the recorded output the value printed under "scores: " is an
# array of 'passage: ...' strings, so result[0] appears to hold passages rather
# than scores — confirm the return order of retriever.search and fix the labels
# if they are swapped.
print("scores: ")
print(result[0])
print("passages: ")
print(result[1])

scores: 
['passage: Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé\'s name is a tribute to her mother\'s maiden name. Beyoncé\'s younger sister Solange is also a singer and a former member of Destiny\'s Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.'
 "passage: Some biographical accounts include the autobiography Life on the Color Line: The True Story of a White Boy Who Discovered He Was Black by Gregory Howard Williams; One Drop: My Father's Hidden Life—A Story of Race and Family Secrets written by Bliss Broyard about her father Anatole Broyard; the documentary Colored White Boy about a white man in North Carolina who 