In [None]:
!pip install datasets
!pip install transformers
!pip install faiss-gpu

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from datasets import Dataset
import faiss

In [None]:
# Sampling configuration: a fixed seed keeps the shuffle repeatable, and
# NUM_SAMPLES is the number of streamed snippets kept in memory.
SEED = 25
NUM_SAMPLES = 1000

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# streaming=True yields examples lazily instead of materialising the corpus.
loading = load_dataset(
    'vblagoje/wikipedia_snippets_streamed',
    split='train',
    streaming=True
).shuffle(seed=SEED)

# zip() asks range() for the next value first, so it stops after NUM_SAMPLES
# examples (or earlier if the stream is shorter) without reading an extra one.
ds = [example for _, example in zip(range(NUM_SAMPLES), loading)]

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

In [None]:
# Write the sampled rows to a CSV file and read that file back as a dataset
# with a single 'train' split.
csv_path = "dataset.csv"
ds = pd.DataFrame(ds)
ds.to_csv(csv_path, index=False)
dataset = load_dataset("csv", data_files=csv_path, split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def keep_columns(source, columns_to_keep):
    """Return a new dataset holding only ``columns_to_keep``.

    Args:
        source: object exposing ``column_names`` and ``remove_columns``
            (here, the dataset loaded from the CSV file).
        columns_to_keep: names of the columns that must survive.

    Returns:
        Whatever ``source.remove_columns`` returns once every other column
        has been dropped.

    Raises:
        ValueError: if a requested column is not present in ``source``.
    """
    available = list(source.column_names)
    missing = [name for name in columns_to_keep if name not in available]
    if missing:
        # Fail early with an explicit message instead of letting an unknown
        # name leak into the list of columns to remove.
        raise ValueError(f"Columns not found in dataset: {missing}")
    columns_to_remove = [name for name in available if name not in columns_to_keep]
    return source.remove_columns(columns_to_remove)


# Passages keyed by article id: this is the text that gets embedded.
articles_ds = keep_columns(dataset, ["wiki_id", "passage_text"])
print(next(iter(articles_ds)))

# Titles keyed by the same id: these are used as search queries.
titles_ds = keep_columns(dataset, ["wiki_id", "article_title"])
next(iter(titles_ds))

{'wiki_id': 'Q7673084', 'passage_text': 'was released either in the FOIA or to the media.\nShe was one of two persons who disrobed publicly in protest of the proposed "baggy pants ban" in Atlanta in 2007, creating a media row that helped to stall the legislation in committee.  She is a former employee of The Chamber and The Clermont Lounge.  She is a street medic, a member of the IWW #690 (sex trade workers), a former stripper, a former dominatrix, a bartender, a former fetish club promoter in Atlanta, Georgia with Agoraphobia Productions, a vegan, and web designer.'}


{'wiki_id': 'Q7673084', 'article_title': 'Tabitha Fringe Chase'}

In [None]:
# Model declaration
# Exactly one checkpoint is active; the commented ones are the alternatives
# labelled Model 1 and Model 2.
#model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Model 1
#model_ckpt = "flax-sentence-embeddings/all_datasets_v3_mpnet-base" # Model 2
model_ckpt = 'sentence-transformers/bert-base-nli-mean-tokens' # Model 3
# NOTE(review): this checkpoint's name says "mean-tokens", while get_embeddings
# pools only the first token -- confirm the pooling matches each checkpoint.

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Load the weights and move them to the selected device in a single step.
model = AutoModel.from_pretrained(model_ckpt).to(device)
print("Model loaded")

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model loaded


In [None]:
def cls_pooling(model_output):
    """Select index 0 along the second axis of ``model_output.last_hidden_state``.

    Assumes the hidden state is laid out as (batch, sequence, hidden), so the
    result is one vector per sequence taken from its first token (presumably
    the [CLS] token for BERT-style tokenizers) -- TODO confirm.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_embeddings(text_list, max_length=None):
    """Tokenize ``text_list``, run the model, and return the pooled embeddings.

    Args:
        text_list: text (or list of texts) handed to the module-level
            ``tokenizer``.
        max_length: optional truncation length forwarded to the tokenizer.
            When omitted, the tokenizer's own limit applies; if the tokenizer
            has no usable limit, 512 is used so that ``truncation=True``
            actually truncates.

    Returns:
        The tensor produced by ``cls_pooling`` on the model output. It lives on
        ``device`` and carries no autograd history.
    """
    if max_length is None:
        tokenizer_limit = getattr(tokenizer, "model_max_length", None)
        # transformers reports a huge sentinel value when a checkpoint ships
        # no maximum length; truncation would then be silently disabled.
        # NOTE(review): 512 is the usual BERT-family limit -- confirm it fits
        # any new checkpoint.
        if tokenizer_limit is None or tokenizer_limit > 100_000:
            max_length = 512

    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    # Inference only: skip building the autograd graph for every call.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Create an embeddings dataset
def embed_passage(row):
    """Return the ``embeddings`` entry to add to a single dataset row."""
    pooled = get_embeddings(row["passage_text"])
    # Move the result to host memory as a NumPy array and keep its first entry
    # (presumably the only one, since a single passage is passed in).
    return {"embeddings": pooled.detach().cpu().numpy()[0]}


embeddings_dataset = articles_ds.map(embed_passage)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# FAISS for efficient similarity search
# The metric is spelled out (L2 is what the index uses when none is given), so
# readers know the returned scores are distances: smaller means closer.
embeddings_dataset.add_faiss_index(column="embeddings", metric_type=faiss.METRIC_L2)

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['wiki_id', 'passage_text', 'embeddings'],
    num_rows: 1000
})

In [None]:
# Retrieval benchmark: each article title is used as a query against the
# passage index. A query earns 10 points when the best hit belongs to the same
# article (same wiki_id), or 5 points when only the second hit does.
TOP1_POINTS = 10
TOP2_POINTS = 5

# Read each column once instead of re-reading it on every iteration.
titles = titles_ds["article_title"]
target_ids = embeddings_dataset["wiki_id"]

c = 0
for title, target_id in zip(titles, target_ids):
    query_embedding = get_embeddings([title]).cpu().detach().numpy()
    scores, samples = embeddings_dataset.get_nearest_examples("embeddings", query_embedding, k=2)

    # Hits are read in the order the index returns them (closest first).
    # The earlier DataFrame sort did not change which rows were compared,
    # because label lookups [0] and [1] ignore row order.
    retrieved_ids = samples["wiki_id"]
    if len(retrieved_ids) > 0 and retrieved_ids[0] == target_id:
        c += TOP1_POINTS
    elif len(retrieved_ids) > 1 and retrieved_ids[1] == target_id:
        c += TOP2_POINTS

print("Model score: ",c)

Model score:  1335


In [None]:
# Scores recorded for each checkpoint ("Model 1/2/3" in the model declaration
# cell). The benchmark awards at most 10 points for each of the 1000 queries.
Model_1, Model_2, Model_3 = 9170, 8995, 1335