In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip3 install -U sentence-transformers
!pip install faiss-gpu

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 11.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 31.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.1 MB/s 
Colle

In [3]:
import torch
import faiss
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
import pickle
import io
import os
import numpy
from sklearn.metrics import accuracy_score
import re
import time

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)

class SBERT_XL_Sum() : 
    def __init__(self, dataset_name, save=False, saved_embeddings_path = None):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.data = self.load_dataset(dataset_name)
        if saved_embeddings_path == None:
            self.model = SentenceTransformer('distiluse-base-multilingual-cased')
            self.model.max_seq_length = self.model.max_seq_length 
            self.data_embeddings = self.compute_text_embeddings()
            if save==True:
              with open(dataset_name+'_embeddings.pkl', "wb") as fOut:
                pickle.dump(self.data_embeddings, fOut, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            if self.device == "cpu":
                with open(saved_embeddings_path, "rb") as f:
                    self.data_embeddings = CPU_Unpickler(f).load()
            else:
                self.data_embeddings = pd.read_pickle(saved_embeddings_path) 

        self.results = self.test_model()

    def load_dataset(self,dataset_name):
        dataset = pd.read_csv('/content/gdrive/MyDrive/Dataset/'+dataset_name+'_dataset.csv')
        return dataset
    
    def split_sentence(self,sentence):
        if len(sentence.split()) > 200:
            s = []
            s.append(' '.join(sentence.split()[:200]))
            s.append(' '.join(self.split_sentence(' '.join(sentence.split()[200:]))))
            return s
        else: 
            return [sentence]

    def compute_text_embeddings(self):
        embeddings = []
        for id, row in tqdm(self.data.iterrows(), desc="compute Embeddings: "):
            summary_sentences = [row.title]
            text_sentences = [row.title]
            for sentence in sent_tokenize(row.summary):
                s = self.split_sentence(sentence)
                summary_sentences.extend(s)
            summary_embeddings = self.model.encode(summary_sentences, convert_to_tensor=True)
            for sentence in sent_tokenize(row.text):
                s = self.split_sentence(sentence)
                text_sentences .extend(s)
            text_embeddings = self.model.encode(text_sentences, convert_to_tensor=True)
            embeddings.append({"title": row.title, "lang": row.lang, "summary_embedding": torch.mean(summary_embeddings, dim=0), "text_embedding": torch.mean(text_embeddings, dim=0)}) 
        return embeddings

    def faiss_index(self,query_idx,query_lang, query, k):
        d = len(self.data_embeddings[query_idx]['text_embedding']) #embedding's size 
        n = len(self.data_embeddings) #number of articles
        other_articles_embeddings = [item['summary_embedding'] for item in self.data_embeddings[:query_idx] if item['lang']==query_lang] + [item['summary_embedding'] for item in self.data_embeddings[query_idx+1:] if item['lang']==query_lang]
        document_embeddings = numpy.array([numpy.array(x.cpu()) for x in other_articles_embeddings])
        index = faiss.IndexFlatL2(d)   # build the index, d=size of vectors 
        index.add(document_embeddings)                  
        D, I = index.search(query, k) 
        I_titles = [] 
        for i in I[0]:
            I_titles.append({"title":list(self.data.title)[i], "summary": list(self.data.summary)[i]})
        return(I_titles)

    def find_similar_summaries(self, title, num_items):
        idx = list(self.data.title).index(title)
        query_embed = self.data_embeddings[idx]['text_embedding'].cpu().numpy().reshape(1,len(self.data_embeddings[idx]['text_embedding']))
        query_lang = self.data_embeddings[idx]['lang']
        similar_docs = self.faiss_index(idx, query_lang=query_lang, query=query_embed, k=num_items)
        return similar_docs

    def test_model(self, k=10):
        model_labels = {}
        for title in tqdm(list(self.data.title), desc="Find k = {} Similar articles".format(k)):
            model_labels[title] = self.find_similar_summaries(title, k)
        return model_labels


In [None]:
#test_sbert = SBERT_XL_Sum(dataset_name="test", save=True)
test_sbert = SBERT_XL_Sum(dataset_name="test", saved_embeddings_path='test_embeddings.pkl')

Find k = 10 Similar articles:  10%|▉         | 3960/40461 [21:06<4:04:47,  2.49it/s]

In [None]:
test_sbert.results