## Installation and Importing

In [1]:
!pip install sentence-transformers
!pip install faiss-gpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [22]:
import pandas as pd
from datasets import load_dataset
import faiss
import torch
#from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sentence_transformers.util import dot_score, cos_sim
from sentence_transformers import SentenceTransformer
import numpy as np
from pprint import pprint
import time

In [5]:
# Dataset column names -- adjust both to match your own dataset.
# article_col: column whose text is embedded, indexed and returned by search.
article_col = "text"
# query_col: column intended to serve as the query side of the dataset.
query_col = "title"

In [3]:
# To use your own CSV file instead of the Hugging Face dataset:
#data_path = ""
#df = pd.read_csv(data_path)
# ...and split it into train / val / test (60% / 20% / 20%):
#train_df, val_df, test_df = \
#              np.split(df.sample(frac=1, random_state=42),
#                       [int(.6*len(df)), int(.8*len(df))])

# English configuration of XL-Sum; `df` maps split name -> dataset.
df = load_dataset("csebuetnlp/xlsum", "english")
print(df.keys())

# Build one DataFrame per split with a single pipeline instead of three
# copy-pasted stanzas. drop_duplicates() returns a new frame (no in-place
# mutation, so re-running the cell is safe), and reset_index(drop=True) keeps
# row labels equal to row positions after duplicates are removed -- later cells
# use row positions as FAISS ids and look rows up by position.
train_df, val_df, test_df = (
    pd.DataFrame(df[split]).drop_duplicates().reset_index(drop=True)
    for split in ("train", "validation", "test")
)

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/264M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306522 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11535 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11535 [00:00<?, ? examples/s]

dict_keys(['train', 'test', 'validation'])


In [None]:
# Sanity check: print the (rows, columns) shape of the test split.
print(test_df.shape)

In [12]:
# Sentence-embedding model used to encode both the articles and the queries.
model = SentenceTransformer('SeyedAli/Multilingual-Text-Semantic-Search-Siamese-BERT-V1')
# Encode a single article to discover the embedding size. .iloc[0] picks the
# first row by position, so this works whatever the frame's index labels are
# (a label lookup with [0] raises KeyError when no row is labelled 0).
out_shape = model.encode(test_df[article_col].iloc[0]).shape
print(out_shape[0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

384


In [16]:
# Embed every article of the test split and hand FAISS a float32 array.
encoded_data = np.asarray(
    model.encode(test_df[article_col].tolist()),
    dtype='float32',
)
# Flat inner-product index, wrapped in an IndexIDMap so vectors can be added
# under explicit ids.
index = faiss.IndexIDMap(faiss.IndexFlatIP(out_shape[0]))

Batches:   0%|          | 0/361 [00:00<?, ?it/s]

In [17]:
# Each article is stored under its row position in test_df, which is what
# fetch_article() later resolves with DataFrame.iloc. FAISS ids must be int64,
# so the dtype is requested explicitly instead of relying on the platform's
# default integer type.
# NOTE: running this cell again without recreating `index` adds every vector a
# second time.
index.add_with_ids(encoded_data, np.arange(len(test_df), dtype='int64'))
# Persist the populated index to disk.
faiss.write_index(index, 'articles.index')

In [25]:
def fetch_article(dataframe_idx, df = test_df):
    """Look up one article by row position.

    Args:
        dataframe_idx: Positional (``iloc``) row index into ``df``.
        df: Frame to read from; defaults to the ``test_df`` object that exists
            when this function is defined.

    Returns:
        A dict with a single key ``'article'`` holding the value of the
        ``article_col`` column for that row.
    """
    row = df.iloc[dataframe_idx]
    return {'article': row[article_col]}
    
def search(query, top_k, index, model):
    """Return the indexed articles most similar to ``query``, best match first.

    Args:
        query: Query text to embed.
        top_k: Maximum number of hits to request from the index.
        index: Index object exposing ``search(vectors, k)`` and returning a
            ``(scores, ids)`` pair; ids must be row positions that
            ``fetch_article`` can resolve.
        model: Encoder exposing ``encode(list_of_texts)``.

    Returns:
        A list of ``fetch_article`` results in the order the index ranked
        them. Repeated ids are collapsed to their first (best) occurrence and
        empty result slots are dropped, so the list may be shorter than
        ``top_k``.
    """
    t = time.time()
    # FAISS only accepts float32 query vectors.
    query_vector = np.asarray(model.encode([query]), dtype='float32')
    _scores, id_matrix = index.search(query_vector, top_k)
    print('>>>> Results in Total Time: {}'.format(time.time()-t))
    # Only one query was submitted, so its hits are in row 0.
    ranked_ids = id_matrix[0].tolist()
    # dict.fromkeys() de-duplicates while preserving the ranking; np.unique()
    # would re-sort the hits by id and discard the relevance order.
    # FAISS pads the result with -1 when fewer than top_k vectors are found;
    # those must be skipped or iloc[-1] would return the last article.
    unique_ids = [idx for idx in dict.fromkeys(ranked_ids) if idx >= 0]
    return [fetch_article(idx) for idx in unique_ids]

In [28]:
# Example: fetch up to five articles for a one-word query and print each hit.
query = "Gaza"
results = search(query, top_k=5, index=index, model=model)

print("\n")
for result in results:
    print("\t", result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

>>>> Results in Total Time: 0.03978729248046875


	 {'article': 'Such a move would violate international law and leave what would amount to "a Palestinian Bantustan", they warned. Prime Minister Benjamin Netanyahu says in July he may start the process of applying Israeli sovereignty to Jewish settlements and the Jordan Valley. Such a move was effectively green-lighted by Donald Trump\'s peace plan. Mr Trump\'s Vision for Peace, released in January, also envisages a Palestinian state in the remaining 70% of the West Bank, all of Gaza, and with its capital on the fringes of East Jerusalem. The Palestinians - who claim all of the West Bank, Gaza and East Jerusalem - have dismissed the plan as biased towards Israel and a denial of their rights. Israel has occupied the West Bank and East Jerusalem since the 1967 Middle East war. It pulled its troops and settlers out of Gaza in 2005, but the UN says its occupation there has not ended. More than 600,000 Jews live in about 140 settlements in t