In [1]:
# !pip install -U sentence-transformers

In [2]:
from datasets import load_dataset, DatasetDict
import sentence_transformers
import sentence_transformers.cross_encoder.evaluation
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample # High-level sentence encoders.
import sentence_transformers.models as models
import sentence_transformers.losses as losses
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm # Enables progress bars
import pandas as pd

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

QUICK_RUN = False # Config setting to switch between foreground (subset) and background (full-dataset) running

In [3]:
queries = load_dataset("BeIR/scidocs", "queries", split="queries")
docs = load_dataset("BeIR/scidocs", "corpus", split="corpus")
qrels = load_dataset("BeIR/scidocs-qrels", delimiter="\t", split="test")

In [4]:
print(len(queries))
print(len(docs))
print(len(qrels))
print(len(set(qrels["query-id"])))
print(len(set(qrels["corpus-id"])))

1000
25657
29928
1000
25657


In [5]:
queries, docs, qrels

(Dataset({
     features: ['_id', 'title', 'text'],
     num_rows: 1000
 }),
 Dataset({
     features: ['_id', 'title', 'text'],
     num_rows: 25657
 }),
 Dataset({
     features: ['query-id', 'corpus-id', 'score'],
     num_rows: 29928
 }))

In [6]:
# For demonstration purposes only
if QUICK_RUN:
    queries = queries.select(range(100))
    docs = docs.select(range(2500))
    qrels = qrels.filter(lambda x: x["query-id"] in queries["_id"] and x["corpus-id"] in docs["_id"])

In [7]:
# 90% train, 10% test + validation
train_testvalid = qrels.train_test_split(test_size=0.1, seed=1)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=1)

# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']  # Using the training split from the test_valid split as validation
})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 26935
    })
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1497
    })
    valid: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1496
    })
})

In [8]:
def get_triple_for_example(example):
    q = queries[queries["_id"].index(example["query-id"])]["text"]
    d = docs[docs["_id"].index(example["corpus-id"])]["title"]
    r = example["score"]
    return q, d, r

ex0 = get_triple_for_example(train_test_valid_dataset["test"][0])
ex1 = get_triple_for_example(train_test_valid_dataset["test"][1])
ex0, ex1

(('Provable data possession at untrusted stores',
  'StreamOp: An Innovative Middleware for Supporting Data Management and Query Functionalities over Sensor Network Streams Efficiently',
  0),
 ('Rumor Detection and Classification for Twitter Data',
  'Thumbs Up or Thumbs Down? Semantic Orientation Applied to Unsupervised Classification of Reviews',
  1))

In [9]:
from collections import Counter
from scipy import stats

# From Huggingface Evaluate
def label_dist(data):
    """Returns the fraction of each label present in the data"""
    c = Counter(data)
    label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]}
    
    if isinstance(data[0], str):
        label2id = {label: id for id, label in enumerate(label_distribution["labels"])}
        data = [label2id[d] for d in data]
    
    skew = stats.skew(data)
    
    return {"label_distribution": label_distribution, "label_skew": skew}

print(label_dist(data=train_test_valid_dataset["train"]["score"]))
print(label_dist(data=train_test_valid_dataset["valid"]["score"]))
print(label_dist(data=train_test_valid_dataset["test"]["score"]))

{'label_distribution': {'labels': [1, 0], 'fractions': [0.16461852608130684, 0.8353814739186931]}, 'label_skew': 1.8087864265977875}
{'label_distribution': {'labels': [0, 1], 'fractions': [0.8348930481283422, 0.16510695187165775]}, 'label_skew': 1.8040061996868444}
{'label_distribution': {'labels': [0, 1], 'fractions': [0.8350033400133601, 0.16499665998663995]}, 'label_skew': 1.8050841379113802}


In [10]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

In [11]:
docs.map(lambda x: {"title_text": x["title"] + ": " + x["text"]})["title_text"][:2]

['A hybrid of genetic algorithm and particle swarm optimization for recurrent network design: An evolutionary recurrent network which automates the design of recurrent neural/fuzzy networks using a new evolutionary learning algorithm is proposed in this paper. This new evolutionary learning algorithm is based on a hybrid of genetic algorithm (GA) and particle swarm optimization (PSO), and is thus called HGAPSO. In HGAPSO, individuals in a new generation are created, not only by crossover and mutation operation as in GA, but also by PSO. The concept of elite strategy is adopted in HGAPSO, where the upper-half of the best-performing individuals in a population are regarded as elites. However, instead of being reproduced directly to the next generation, these elites are first enhanced. The group constituted by the elites is regarded as a swarm, and each elite corresponds to a particle within it. In this regard, the elites are enhanced by PSO, an operation which mimics the maturing phenome

In [12]:
# !pip install -U bertopic

In [14]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
import plotly

In [15]:
docs_for_analysis = docs.map(lambda x: {"title_text": x["title"] + ": " + x["text"]})["title_text"]

In [16]:
topic_model = BERTopic(embedding_model=model_name, ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True))
topic_model.fit(docs_for_analysis)

topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,8835,-1_object_scene_objects_cognitive,"[object, scene, objects, cognitive, health, vi...",[Is Verbal Irony Special?: The way we speak ca...
1,0,330,0_antenna_patch_radiation_polarized,"[antenna, patch, radiation, polarized, microst...",[Low-Cost High-Gain and Broadband Substrate- I...
2,1,318,1_encryption_encrypted_cipher_cryptographic,"[encryption, encrypted, cipher, cryptographic,...",[Ring-LWE Ciphertext Compression and Error Cor...
3,2,231,2_reinforcement_rl_policy_reward,"[reinforcement, rl, policy, reward, agent, cri...",[Deep Reinforcement Learning framework for Aut...
4,3,215,3_sentiment_opinion_polarity_opinions,"[sentiment, opinion, polarity, opinions, sarca...",[Domain Specific Sentence Level Mood Extractio...


In [17]:
topic_model.reduce_topics(docs_for_analysis, nr_topics=15)
fig = topic_model.visualize_documents(docs_for_analysis)
plotly.offline.plot(fig, filename='bertopic_doc_embeddings.html')

'bertopic_doc_embeddings.html'

In [18]:
from IPython.display import IFrame
IFrame(src='bertopic_doc_embeddings.html', width=1200, height=800)

In [19]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# Tokenizer and model must match
ex_tokenizer = AutoTokenizer.from_pretrained(model_name)
ex_model = AutoModel.from_pretrained(model_name)
ex_model_with_head = AutoModelForSequenceClassification.from_pretrained(model_name)  # Needs fine-tuning, here for demonstration

test_sentences = ["This is the first sentence with complex tokens, such as SentenceTransformers.", "We can batch multiple sentences."]
ex_tokenized = ex_tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)  # Collates data with padding
ex_res = ex_model(**ex_tokenized)
ex_res_with_head = ex_model_with_head(**ex_tokenized)

print("\nTokenized text:")  # Word Piece Tokenization
print(ex_tokenizer.tokenize(test_sentences))

print("\nToken IDs:")
print(ex_tokenized)

print("\nOutput Dictionary:")
print(ex_res.keys())

print("\nOutput Size:")
print(ex_res.last_hidden_state.size())

print("\nContextualized Token Embeddings (truncated):")
print(ex_res.last_hidden_state[:, :3, :7])  # First 3 tokens

print("\nPooled Embeddings (truncated):")
print(ex_res.pooler_output.shape, ex_res.pooler_output[:, :7])

print("\nPredicted Values (not fine-tuning)")
print(ex_res_with_head)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Tokenized text:
['this', 'is', 'the', 'first', 'sentence', 'with', 'complex', 'token', '##s', ',', 'such', 'as', 'sentence', '##tra', '##ns', '##form', '##ers', '.', 'we', 'can', 'batch', 'multiple', 'sentences', '.']

Token IDs:
{'input_ids': tensor([[  101,  2023,  2003,  1996,  2034,  6251,  2007,  3375, 19204,  2015,
          1010,  2107,  2004,  6251,  6494,  3619, 14192,  2545,  1012,   102],
        [  101,  2057,  2064, 14108,  3674, 11746,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

Output Dictionary:
odict_keys(['last_hidden_state', 'pooler_output'])

Output Size:
torch.Size([2, 20, 384])

Cont

In [20]:
# Uses Mean pooling
topic_model.embedding_model.embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [21]:
# Starts with embeddings
topic_model.embedding_model.embedding_model[0]._modules["auto_model"]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [30]:
from collections import defaultdict
from torch.utils.data import Dataset
from transformers import InputExample
 
class IRDataset(Dataset):
    def __init__(self, queries_ds, docs_ds, qrel_ds, mode="cross"):
        self.mode = mode
        qrels = defaultdict(set)
 
        def transform(x):
            q, d, r = x["query-id"], x["corpus-id"], x["score"]
            q_idx = queries_ds["_id"].index(q)
            x["query_text"] = queries_ds[q_idx]["text"]
            d_idx = docs_ds["_id"].index(d)
            x["doc_content"] = docs_ds[d_idx]["title"] + ": " + docs_ds[d_idx]["text"]
            x["label"] = float(r)
            if r:
                qrels[q].add(d)
            return x
 
        qrel_ds = qrel_ds.map(transform)
        self.q_ids = qrel_ds["query-id"]
        self.d_ids = qrel_ds["corpus-id"]
        self.qrels = qrels
        self.queries = qrel_ds["query_text"]
        self.docs = qrel_ds["doc_content"]
        self.labels = qrel_ds["label"]
 
    def __getitem__(self, idx):
        qs = self.queries[idx]
        ds = self.docs[idx]
        guid = f"{self.q_ids[idx]}_{self.d_ids[idx]}"  # Generating a unique identifier
    
        if self.mode == "rep":
            if type(idx) is int:
                text_list = [{"query": qs}, {"doc": ds}]
            else:
                text_list = [[{"query": q} for q in qs], [{"doc": d} for d in ds]]
            return InputExample(guid=guid, text_a=qs, text_b=ds, label=self.labels[idx])
        return InputExample(guid=guid, text_a=qs, text_b=ds, label=self.labels[idx])
 
    def set_mode(self, mode):
        self.mode = mode
 
    def __len__(self):
        return len(self.labels)

In [31]:
train_ds = IRDataset(queries, docs, train_test_valid_dataset["train"])
valid_ds = IRDataset(queries, docs, train_test_valid_dataset["valid"])
train_ds[0].__dict__

{'guid': 'eecbbb0ba7b513a2fe1e7a0131213e5a94b1868a_fb0031e4d2a7358fca04da94c5d7e52da794fe34',
 'text_a': 'Toward an IT governance maturity self-assessment model using EFQM and CobiT',
 'text_b': 'A maturity model for information governance: Information Governance (IG) as defined by Gartner is the “specification of decision rights and an accountability framework to encourage desirable behavior in the valuation, creation, storage, use, archival and deletion of information. Includes the processes, roles, standards and metrics that ensure the effective and efficient use of information in enabling an organization to achieve its goals”. In this paper, we present how to create an IG maturity model based on existing reference documents. The process is based on existing maturity model development methods. These methods allow for a systematic approach to maturity model development backed up by a well-known and proved scientific research method called Design Science Research. Then, based on the m

In [32]:
monoBERT = CrossEncoder(model_name, # We use cross-encoder as monoBERT example
                        num_labels=1, # Perform binary classification
                        device=None, # Will use CUDA if available
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
monoBERT.predict([ex0[:2], ex1[:2]])

array([0.49718797, 0.48692062], dtype=float32)

In [34]:
print(train_ds[0])

InputExample(guid='eecbbb0ba7b513a2fe1e7a0131213e5a94b1868a_fb0031e4d2a7358fca04da94c5d7e52da794fe34', text_a='Toward an IT governance maturity self-assessment model using EFQM and CobiT', text_b='A maturity model for information governance: Information Governance (IG) as defined by Gartner is the “specification of decision rights and an accountability framework to encourage desirable behavior in the valuation, creation, storage, use, archival and deletion of information. Includes the processes, roles, standards and metrics that ensure the effective and efficient use of information in enabling an organization to achieve its goals”. In this paper, we present how to create an IG maturity model based on existing reference documents. The process is based on existing maturity model development methods. These methods allow for a systematic approach to maturity model development backed up by a well-known and proved scientific research method called Design Science Research. Then, based on the 

In [35]:
train_dl = DataLoader(train_ds, batch_size=32)
# We need sentence pairs format for the library here.
# valid_dl = DataLoader(valid_ds, batch_size=32)
sentence_pairs = list(zip(valid_ds.queries, valid_ds.docs))
labels = valid_ds.labels
len(train_dl)

842

In [36]:
monoBERT.__dict__.keys()

dict_keys(['config', 'model', 'tokenizer', 'max_length', '_target_device', 'default_activation_function'])

In [37]:
repBased = SentenceTransformer(model_name)

In [40]:
qs, ds = repBased.encode([{"query": ex0[0]}, {"query": ex1[0]}]), repBased.encode([{"doc": ex0[1]}, {"doc": ex1[0]}])
sentence_transformers.util.cos_sim(qs, ds)

tensor([[0.1819, 0.1325],
        [0.0353, 1.0000]])

In [47]:
train_ds.set_mode("rep")
valid_ds.set_mode("rep")
train_dl_repBased = DataLoader(train_ds, batch_size=32, collate_fn=repBased.smart_batching_collate)
valid_dl_repBased = DataLoader(valid_ds, batch_size=32, collate_fn=repBased.smart_batching_collate)
# assert next(iter(train_dl_repBased))

In [48]:
queries_dict = dict(zip(valid_ds.q_ids, valid_ds.queries))
docs_dict = dict(zip(valid_ds.d_ids, valid_ds.docs))
qrels_dict = valid_ds.qrels

In [52]:
ir_evaluator = sentence_transformers.evaluation.InformationRetrievalEvaluator(queries_dict, docs_dict, qrels_dict, write_csv=True)

repBased.fit(
    train_objectives=[(train_dl_repBased, losses.CosineSimilarityLoss(repBased))],
    evaluator=ir_evaluator,
    epochs=10,
    optimizer_class=torch.optim.AdamW,
    show_progress_bar=True,
    save_best_model=True,
    output_path="./",
)

In [53]:
qs, ds = repBased.encode([{"query": ex0[0]}, {"query": ex1[0]}]), repBased.encode([{"doc": ex0[1]}, {"doc": ex1[0]}])
sentence_transformers.util.cos_sim(qs, ds)

tensor([[0.1819, 0.1325],
        [0.0353, 1.0000]])

In [56]:
df = pd.read_csv("eval/Information-Retrieval_evaluation_results.csv")
df.tail(n=10)

df.set_index("epoch").drop(columns=["steps"]).plot(legend=False)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=3)