# Agenda:



## Explore the dataset


In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
#set those vars according to your dataset
query_col = "title"
article_col = "text"

In [3]:
#use if your dataset is in a csv file
#data_path = "" 
#df = pd.read_csv(data_path)
#also split your dataset into train, val, test
#train_df, val_df, test_df = \
#              np.split(df.sample(frac=1, random_state=42), 
#                       [int(.6*len(df)), int(.8*len(df))])

df = load_dataset("csebuetnlp/xlsum", "english")
print(df.keys())
train_df = pd.DataFrame(df["train"])
train_df.drop_duplicates(inplace= True)
val_df = pd.DataFrame(df["validation"])
val_df.drop_duplicates(inplace= True)
test_df = pd.DataFrame(df["test"])
test_df.drop_duplicates(inplace= True)

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/264M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306522 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11535 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11535 [00:00<?, ? examples/s]

dict_keys(['train', 'test', 'validation'])


In [5]:
print(val_df.shape)
val_df.head()

(11535, 5)


Unnamed: 0,id,url,title,summary,text
0,uk-england-hampshire-35233245,https://www.bbc.com/news/uk-england-hampshire-...,Portsmouth sea wall hole repair work delayed,Work to repair a sea wall that collapsed durin...,A large void was temporarily plugged with gran...
1,newsbeat-50388934,https://www.bbc.com/news/newsbeat-50388934,What's happening in Bolivia?,Bolivia's President Evo Morales has now left t...,His resignation on Sunday sparked clashes in c...
2,uk-northern-ireland-48543603,https://www.bbc.com/news/uk-northern-ireland-4...,Two fifths of NI electricity from renewables,Almost 40% of the electricity generated in NI ...,By Conor MacauleyBBC NI Agriculture & Environm...
3,uk-england-suffolk-52175896,https://www.bbc.com/news/uk-england-suffolk-52...,Coronavirus: Suffolk man 'thanks NHS' with 50-...,A man who ran the London Marathon with a tumbl...,Ben Blowes started at dawn on Saturday and too...
4,world-asia-india-49137373,https://www.bbc.com/news/world-asia-india-4913...,"India floods: Over 1,000 train passengers resc...","Indian authorities have rescued 1,050 people f...","Helicopters, boats and diving teams were deplo..."


In [5]:
val_df["query_length"]= val_df[query_col].str.len() 
val_df["article_length"]= val_df[article_col].str.len() 

In [7]:
val_df['query_length'].describe()

count    11535.000000
mean        52.803208
std          9.410136
min         14.000000
25%         48.000000
50%         53.000000
75%         58.000000
max         97.000000
Name: query_length, dtype: float64

In [8]:
val_df['article_length'].describe()

count    11535.000000
mean      2612.896749
std       1567.214988
min        355.000000
25%       1470.500000
50%       2153.000000
75%       3421.500000
max      10107.000000
Name: article_length, dtype: float64

## Getting the embeddings

In [6]:
#choose the dataset that you want to test on
df = val_df

In [7]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [11]:
from transformers import BertTokenizer, BertModel
import torch
#from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers.util import dot_score, cos_sim
from sentence_transformers import SentenceTransformer

In [12]:
def find_match_for_query(q,arts,cosine):
    scores=[]
    for art in arts:
        #print(cosine_similarity([q], [art]))
        if cosine:
            scores.append(cos_sim(q, art))
        else:
            scores.append(dot_score(q, art))
    #print(max(scores),scores.index(max(scores)),scores)
    return scores.index(max(scores))

In [13]:
def get_embed(sentence, tokenizer, model):
    # Tokenize the sentences
    tokens = tokenizer.tokenize(sentence, truncation=True)
    # Convert tokens to input IDs
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)  # Batch size 1
    # Obtain the model embeddings
    with torch.no_grad():
        outputs = model(input_ids)
        return outputs.last_hidden_state[:, 0, :]  # [CLS] token

In [14]:
def get_acc(q_embed, art_embed, no_samples = 1000, cosine = True):
    correct=0
    pred_ans=[]
    for i in tqdm(range(no_samples)):#len(q_embed))):
        pa=find_match_for_query(q_embed[i],art_embed, cosine)
        if pa == i:
            correct+=1
        pred_ans.append(pa)
        if i%100==0:
            print("accuracy for "+str(i)+" samples",(correct/len(pred_ans))*100)
    return (correct/len(pred_ans))*100, pred_ans

## The twin models

Tested on the first 1000 samples of the validation dataset and got 78% accuracy

In [28]:
model_Q = SentenceTransformer('flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-Q')
model_A = SentenceTransformer('flax-sentence-embeddings/multi-QA_v1-mpnet-asymmetric-A')

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [29]:
model_A.encode(df[article_col][0]).shape,model_Q.encode(df[query_col][0]).shape

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

((768,), (768,))

In [None]:
art_embed=model_A.encode(df[article_col])
q_embed=model_Q.encode(df[query_col])

In [38]:
print(len(art_embed), len(q_embed))
print("accuracy",get_acc(q_embed, art_embed)[0])

11535 11535


  0%|          | 1/1000 [00:01<20:22,  1.22s/it]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [01:58<17:27,  1.17s/it]

accuracy for 100 samples 76.23762376237624


 13%|█▎        | 127/1000 [02:29<17:07,  1.18s/it]


KeyboardInterrupt: 

## distilbert

Tested on the first 1000 samples of the validation dataset and got 72.36% accuracy

In [40]:
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-dot-prod-v3')


In [41]:
embed = model.encode(df[query_col][0])
print(embed.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)


In [None]:
art_embed=model.encode(df[article_col])
q_embed=model.encode(df[query_col])

In [33]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 1/1000 [00:00<09:18,  1.79it/s]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [00:54<08:10,  1.83it/s]

accuracy for 100 samples 66.33663366336634


 20%|██        | 201/1000 [01:48<06:46,  1.97it/s]

accuracy for 200 samples 63.681592039801


 30%|███       | 301/1000 [02:42<06:10,  1.89it/s]

accuracy for 300 samples 60.46511627906976


 40%|████      | 401/1000 [03:35<05:25,  1.84it/s]

accuracy for 400 samples 62.593516209476306


 50%|█████     | 501/1000 [04:29<04:25,  1.88it/s]

accuracy for 500 samples 61.876247504990026


 60%|██████    | 601/1000 [05:24<03:25,  1.94it/s]

accuracy for 600 samples 63.394342762063225


 70%|███████   | 701/1000 [06:18<02:52,  1.74it/s]

accuracy for 700 samples 64.76462196861627


 80%|████████  | 801/1000 [07:11<01:42,  1.95it/s]

accuracy for 800 samples 65.04369538077404


 90%|█████████ | 901/1000 [08:07<00:52,  1.87it/s]

accuracy for 900 samples 64.26193118756936


100%|██████████| 1000/1000 [09:01<00:00,  1.85it/s]

accuracy 64.0





## roberta

In [42]:
model = SentenceTransformer('sentence-transformers/msmarco-roberta-base-ance-firstp')

In [None]:
#for i in tqdm(range(len(df[article_col]))):
art_embed=model.encode(df[article_col])
q_embed=model.encode(df[query_col])

In [43]:
embed = model.encode(df[query_col][0])
print(embed.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)


In [44]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 1/1000 [00:00<08:47,  1.90it/s]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [00:51<07:49,  1.92it/s]

accuracy for 100 samples 70.29702970297029


 20%|██        | 201/1000 [01:41<06:35,  2.02it/s]

accuracy for 200 samples 72.13930348258707


 30%|███       | 301/1000 [02:31<05:40,  2.05it/s]

accuracy for 300 samples 71.09634551495017


 40%|████      | 401/1000 [03:22<04:51,  2.06it/s]

accuracy for 400 samples 71.571072319202


 50%|█████     | 501/1000 [04:13<04:04,  2.04it/s]

accuracy for 500 samples 70.85828343313374


 60%|██████    | 601/1000 [05:03<03:16,  2.03it/s]

accuracy for 600 samples 71.54742096505824


 70%|███████   | 701/1000 [05:54<02:30,  1.99it/s]

accuracy for 700 samples 72.89586305278173


 80%|████████  | 801/1000 [06:45<01:40,  1.99it/s]

accuracy for 800 samples 72.40948813982521


 90%|█████████ | 901/1000 [07:36<00:49,  2.02it/s]

accuracy for 900 samples 72.36403995560488


100%|██████████| 1000/1000 [08:26<00:00,  1.97it/s]

accuracy 72.39999999999999





## DistilBert again!

In [34]:
# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")
model = AutoModel.from_pretrained("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")

tokenizer_config.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/262k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [35]:
get_embed(df[article_col][0], tokenizer, model).shape

torch.Size([1, 768])

In [27]:
from tqdm import tqdm
art_embed=[]
q_embed=[]
for i in tqdm(range(len(df[article_col]))):
    art_embed.append(get_embed(df[article_col][i], tokenizer, model))
    q_embed.append(get_embed(df[query_col][i], tokenizer, model))

100%|██████████| 11535/11535 [59:00<00:00,  3.26it/s] 


In [None]:
print(len(art_embed),len(q_embed))

In [31]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 2/1000 [00:00<03:20,  4.99it/s]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [00:22<04:41,  3.20it/s]

accuracy for 100 samples 45.54455445544555


 20%|██        | 202/1000 [00:44<02:39,  5.00it/s]

accuracy for 200 samples 43.78109452736319


 30%|███       | 302/1000 [01:05<02:20,  4.98it/s]

accuracy for 300 samples 44.85049833887043


 40%|████      | 402/1000 [01:27<02:00,  4.96it/s]

accuracy for 400 samples 45.137157107231914


 50%|█████     | 501/1000 [01:49<01:46,  4.70it/s]

accuracy for 500 samples 44.31137724550898


 60%|██████    | 602/1000 [02:11<01:19,  4.99it/s]

accuracy for 600 samples 43.261231281198


 70%|███████   | 702/1000 [02:32<00:55,  5.34it/s]

accuracy for 700 samples 43.794579172610554


 80%|████████  | 801/1000 [02:52<00:38,  5.22it/s]

accuracy for 800 samples 44.694132334581774


 90%|█████████ | 902/1000 [03:14<00:19,  5.10it/s]

accuracy for 900 samples 43.84017758046615


100%|██████████| 1000/1000 [03:34<00:00,  4.66it/s]

accuracy 44.1





## Siamese-BERT

In [30]:
#Load the model
model = SentenceTransformer('SeyedAli/Multilingual-Text-Semantic-Search-Siamese-BERT-V1')

In [32]:
#Encode query and documents
q_embed = model.encode(df[query_col])
art_embed = model.encode(df[article_col])

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [33]:
print(q_embed[0].shape)

(384,)


In [15]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 1/1000 [00:00<08:55,  1.86it/s]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [00:52<07:43,  1.94it/s]

accuracy for 100 samples 69.3069306930693


 20%|██        | 201/1000 [01:43<07:49,  1.70it/s]

accuracy for 200 samples 70.1492537313433


 30%|███       | 301/1000 [02:35<05:50,  2.00it/s]

accuracy for 300 samples 68.10631229235881


 40%|████      | 401/1000 [03:26<05:00,  1.99it/s]

accuracy for 400 samples 67.83042394014963


 50%|█████     | 501/1000 [04:17<04:19,  1.92it/s]

accuracy for 500 samples 67.66467065868264


 60%|██████    | 601/1000 [05:08<03:20,  1.99it/s]

accuracy for 600 samples 68.88519134775375


 70%|███████   | 701/1000 [06:00<02:40,  1.86it/s]

accuracy for 700 samples 69.75748930099857


 80%|████████  | 801/1000 [06:52<01:51,  1.78it/s]

accuracy for 800 samples 70.53682896379524


 90%|█████████ | 901/1000 [07:43<00:49,  2.01it/s]

accuracy for 900 samples 70.9211986681465


100%|██████████| 1000/1000 [08:36<00:00,  1.94it/s]

accuracy 71.39999999999999





## e5

In [21]:
model = SentenceTransformer('intfloat/e5-base-v2')

In [None]:
#Encode query and documents
q_embed = model.encode(df[query_col])
art_embed = model.encode(df[article_col])

In [22]:
embed = model.encode(df[query_col][0])
print(embed.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)


In [17]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 1/1000 [00:00<08:32,  1.95it/s]

accuracy for 0 samples 100.0


 10%|█         | 101/1000 [00:52<07:41,  1.95it/s]

accuracy for 100 samples 88.11881188118812


 20%|██        | 201/1000 [01:44<06:55,  1.92it/s]

accuracy for 200 samples 87.06467661691542


 30%|███       | 301/1000 [02:36<05:57,  1.96it/s]

accuracy for 300 samples 85.04983388704319


 40%|████      | 401/1000 [03:28<05:09,  1.94it/s]

accuracy for 400 samples 84.78802992518703


 50%|█████     | 501/1000 [04:21<04:59,  1.67it/s]

accuracy for 500 samples 84.83033932135729


 60%|██████    | 601/1000 [05:13<03:30,  1.90it/s]

accuracy for 600 samples 85.19134775374376


 70%|███████   | 701/1000 [06:07<02:35,  1.92it/s]

accuracy for 700 samples 86.01997146932952


 80%|████████  | 801/1000 [07:00<01:47,  1.85it/s]

accuracy for 800 samples 85.39325842696628


 90%|█████████ | 901/1000 [07:51<00:50,  1.95it/s]

accuracy for 900 samples 85.3496115427303


100%|██████████| 1000/1000 [08:43<00:00,  1.91it/s]

accuracy 85.3





In [18]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = True)[0])

  0%|          | 1/1000 [00:01<20:08,  1.21s/it]

accuracy for 0 samples 100.0


 10%|█         | 101/1000 [01:56<18:02,  1.20s/it]

accuracy for 100 samples 88.11881188118812


 20%|██        | 201/1000 [03:51<15:14,  1.14s/it]

accuracy for 200 samples 87.06467661691542


 24%|██▎       | 236/1000 [04:32<14:42,  1.15s/it]


KeyboardInterrupt: 

## distiluse

In [23]:
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

In [24]:
q_embed = model.encode(df[query_col])
art_embed = model.encode(df[article_col])

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

In [25]:
print(q_embed[0].shape)

(512,)


In [26]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = False)[0])

  0%|          | 1/1000 [00:00<09:09,  1.82it/s]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [00:52<07:48,  1.92it/s]

accuracy for 100 samples 45.54455445544555


 20%|██        | 201/1000 [01:45<08:40,  1.54it/s]

accuracy for 200 samples 45.27363184079602


 30%|███       | 301/1000 [02:38<05:53,  1.98it/s]

accuracy for 300 samples 48.837209302325576


 40%|████      | 401/1000 [03:31<05:10,  1.93it/s]

accuracy for 400 samples 50.12468827930174


 50%|█████     | 501/1000 [04:23<04:16,  1.95it/s]

accuracy for 500 samples 49.50099800399202


 60%|██████    | 601/1000 [05:16<03:35,  1.85it/s]

accuracy for 600 samples 49.58402662229617


 70%|███████   | 701/1000 [06:10<02:36,  1.91it/s]

accuracy for 700 samples 50.21398002853067


 80%|████████  | 801/1000 [07:04<01:40,  1.98it/s]

accuracy for 800 samples 50.686641697877654


 90%|█████████ | 901/1000 [07:57<00:52,  1.87it/s]

accuracy for 900 samples 50.72142064372919


100%|██████████| 1000/1000 [08:49<00:00,  1.89it/s]

accuracy 50.6





In [27]:
print("accuracy",get_acc(q_embed, art_embed, no_samples = 1000, cosine = True)[0])

  0%|          | 1/1000 [00:01<19:32,  1.17s/it]

accuracy for 0 samples 0.0


 10%|█         | 101/1000 [02:00<18:27,  1.23s/it]

accuracy for 100 samples 48.51485148514851


 20%|██        | 201/1000 [03:59<15:56,  1.20s/it]

accuracy for 200 samples 49.75124378109453


 30%|███       | 301/1000 [05:59<13:33,  1.16s/it]

accuracy for 300 samples 52.823920265780735


 40%|████      | 401/1000 [07:58<12:02,  1.21s/it]

accuracy for 400 samples 53.36658354114713


 50%|█████     | 501/1000 [09:58<09:53,  1.19s/it]

accuracy for 500 samples 53.093812375249506


 60%|██████    | 601/1000 [11:56<08:06,  1.22s/it]

accuracy for 600 samples 53.244592346089846


 70%|███████   | 701/1000 [13:55<05:45,  1.16s/it]

accuracy for 700 samples 53.78031383737518


 80%|████████  | 801/1000 [15:54<04:00,  1.21s/it]

accuracy for 800 samples 54.30711610486891


 90%|█████████ | 901/1000 [17:51<01:52,  1.14s/it]

accuracy for 900 samples 54.05105438401776


100%|██████████| 1000/1000 [19:47<00:00,  1.19s/it]

accuracy 54.2



