In [14]:
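# One-time environment setup; uncomment and run on a fresh environment.
#!pip install transformers
#!pip install scikit-learn   # PyPI name of the package imported as `sklearn`
#!pip install nltk

#import nltk
#nltk.download('stopwords')  # needed once before stopwords.words('spanish') can be used

#!pip install sentence-transformers  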

Collecting sentence-transformers
  Downloading sentence-transformers-1.0.4.tar.gz (74 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.95-cp39-cp39-win_amd64.whl (1.2 MB)
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-1.0.4-py3-none-any.whl size=114306 sha256=b8975eb469d5b70996b8e41c731d6071cc26ec35fe23e39027e495d0f8afd531
  Stored in directory: c:\users\eduardo\appdata\local\pip\cache\wheels\e7\fc\35\51d4c35428e8770140d2fede607f9e6cf1cd3799d748b1168b
Successfully built sentence-transformers
Installing collected packages: sentencepiece, sentence-transformers
Successfully installed sentence-transformers-1.0.4 sentencepiece-0.1.95


In [28]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Select ``top_n`` mutually dissimilar keyphrases (Max Sum Similarity).

    The ``nr_candidates`` phrases most similar to the document are shortlisted,
    then every ``top_n``-sized combination of the shortlist is scored by the
    sum of its pairwise cosine similarities; the combination with the lowest
    sum (i.e. the most diverse one) is returned.

    Args:
        doc_embedding: 2-D array holding the document embedding in row 0
            (only the first row of the similarity matrix is used).
        word_embeddings: 2-D array with one embedding per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Number of keyphrases to return. Clamped to the shortlist size.
        nr_candidates: Shortlist size. Clamped to ``len(words)``.

    Returns:
        A list of entries of ``words``, in shortlist order (ascending
        similarity to the document). Empty when there is nothing to select.
    """
    if len(words) == 0 or top_n <= 0 or nr_candidates <= 0:
        # Also guards the ``[-0:]`` slice below, which would select *everything*.
        return []

    # Use the arguments, not notebook globals, so the function works for any
    # input and on a fresh kernel.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    word_similarity = cosine_similarity(word_embeddings, word_embeddings)

    nr_candidates = min(nr_candidates, len(words))
    # Asking for more keyphrases than shortlisted candidates would yield no
    # combination at all; return the whole shortlist instead of crashing.
    top_n = min(top_n, nr_candidates)

    # Shortlist the nr_candidates entries closest to the document.
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    shortlist_similarity = word_similarity[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are the least
    # similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        # Ordered pairs (i, j), i != j: each unordered pair is counted twice,
        # which does not affect which combination is minimal.
        sim = sum(shortlist_similarity[i][j]
                  for i, j in itertools.permutations(combination, 2))
        # ``candidate is None`` keeps a valid fallback if similarities are NaN.
        if candidate is None or sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [34]:
import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keyphrases with Maximal Marginal Relevance (MMR).

    Starts with the candidate most similar to the document, then repeatedly
    adds the candidate maximising
    ``(1 - diversity) * sim(candidate, doc) - diversity * max sim(candidate, selected)``.

    Args:
        doc_embedding: 2-D array with the document embedding.
            NOTE(review): the argmax logic assumes a single row — confirm
            callers never pass several documents.
        word_embeddings: 2-D array with one embedding per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Maximum number of keyphrases to return.
        diversity: Weight of the redundancy penalty; 0 ranks purely by
            similarity to the document, larger values favour dissimilar picks.

    Returns:
        Up to ``top_n`` entries of ``words`` in selection order (fewer when
        there are not enough candidates; empty for empty input or
        ``top_n <= 0``).
    """
    if len(words) == 0 or top_n <= 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Never iterate past the point where the candidate pool is exhausted;
    # argmax over an empty array would raise.
    for _ in range(min(top_n - 1, len(words) - 1)):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (named mmr_scores so the function name is not shadowed)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [3]:
# Sample input document: a Spanish-language mammography report (findings and
# impression sections) used for the manual keyphrase-extraction cells below.
doc ="""Hallazgos: 

Mamas heterogéneamente densas. 
En la mama derecha, cuadrante supero externo, tercio posterior se observa parcialmente algunos contornos nodulares y a izquierda, también algunos nodulitos en cuadrantes inferiores, tercio posterior, los cuales no eran visibles en examen previo. 
Calcificaciones benignas en regiones axilares. 
Impresión:  
Mamas densas con nódulos bilaterales. 

BIRADS 0."""

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Candidate keyphrases are word trigrams: (min_n, max_n) = (3, 3).
n_gram_range = (3, 3)
# Spanish stop words from NLTK; the 'stopwords' corpus must have been
# downloaded once via nltk.download('stopwords').
stop_words = stopwords.words('spanish')

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
# get_feature_names() is deprecated and later removed in newer scikit-learn
# releases in favour of get_feature_names_out(); support both so the cell
# runs on old and new versions. Always expose a plain list of str.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [24]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its checkpoint name.
# NOTE(review): the checkpoint name suggests English NLI training data while
# `doc` is Spanish; a multilingual checkpoint may rank phrases better — verify.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embed the document (wrapped in a list so it is encoded as a batch of one)
# and every candidate phrase with the same model, so both can be compared
# with cosine similarity.
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keyphrases to keep.
top_n = 5
# Similarity of the document (row 0) to every candidate phrase.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last top_n indices are the most similar
# candidates; the resulting list is ordered from least to most similar
# (the best keyphrase is the LAST element).
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [31]:
# Show the baseline top-n keyphrases (ascending similarity: best match is last).
keywords


['axilares impresión mamas',
 'mamas densas nódulos',
 'mama derecha cuadrante',
 'mamas heterogéneamente densas',
 'hallazgos mamas heterogéneamente']

In [32]:
# Max Sum Similarity with a shortlist of 10 candidates, returning 5 keyphrases.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

['nodulares izquierda nodulitos',
 'izquierda nodulitos cuadrantes',
 'heterogéneamente densas mama',
 'mamas densas nódulos',
 'hallazgos mamas heterogéneamente']

In [33]:
# Same as above with a larger shortlist (20), which gives the search more
# room to pick phrases that differ from each other.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=20)

['nódulos bilaterales birads',
 'previo calcificaciones benignas',
 'externo tercio posterior',
 'axilares impresión mamas',
 'mamas heterogéneamente densas']

In [45]:
# Maximal Marginal Relevance with diversity=0.5, i.e. equal weight on
# similarity to the document and on the penalty for resembling already
# selected phrases.
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.5)

['hallazgos mamas heterogéneamente',
 'cuales visibles examen',
 'posterior observa parcialmente',
 'izquierda nodulitos cuadrantes',
 'mama derecha cuadrante']

In [2]:
!pip install keybert



Collecting keybert
  Downloading keybert-0.2.0.tar.gz (12 kB)
Building wheels for collected packages: keybert
  Building wheel for keybert (setup.py): started
  Building wheel for keybert (setup.py): finished with status 'done'
  Created wheel for keybert: filename=keybert-0.2.0-py3-none-any.whl size=10611 sha256=db19aef9a57aba11a8afa011e77e243d2f44ec1127adcdef898404006e7cf14c
  Stored in directory: c:\users\eduardo\appdata\local\pip\cache\wheels\2b\5e\cb\9bedeed618085f255420717d2960da9704821c46a9ffc1c3c3
Successfully built keybert
Installing collected packages: keybert
Successfully installed keybert-0.2.0


In [42]:
from keybert import KeyBERT

# NOTE: this cell rebinds `doc` (to a second Spanish mammography report) and
# `model` (from the SentenceTransformer to a KeyBERT instance). Re-running the
# earlier embedding cells after this one will therefore not behave as before.
doc = """
         Parénquima mamario denso, heterogéneo y pseudonodular, lo que disminuye la sensibilidad del método.
En la unión de los cuadrantes inferiores, tercio posterior de la mama derecha, se visualiza nódulo isodenso, de bordes aceptablemente definidos, de 6mm., reducido de tamaño entre controles.
No hay lesiones espiculadas, distorsiones ni microcalcificaciones agrupadas de sospecha, solo puntiformes aisladas. 
      """
# KeyBERT built on the same embedding checkpoint name used earlier in the notebook.
model = KeyBERT('distilbert-base-nli-mean-tokens')
# Keyword extraction with the library's default settings.
keywords = model.extract_keywords(doc)

In [58]:
# Single-word keywords (n-gram range 1..1) with Spanish stop words removed.
model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=stopwords.words('spanish'))

[('parénquima', 0.57),
 ('cuadrantes', 0.5348),
 ('aceptablemente', 0.5328),
 ('microcalcificaciones', 0.5078),
 ('agrupadas', 0.5056)]

In [44]:
# Keyphrases of one or two words (n-gram range 1..2), Spanish stop words removed.
model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=stopwords.words('spanish'))

[('aceptablemente definidos', 0.6677),
 ('lesiones espiculadas', 0.6509),
 ('parénquima mamario', 0.6357),
 ('agrupadas sospecha', 0.6142),
 ('bordes aceptablemente', 0.6126)]

In [45]:
# Trigram keyphrases using KeyBERT's max-sum option (use_maxsum=True) with a
# pool of 20 candidates and 5 results — presumably the library counterpart of
# the hand-written max_sum_sim() defined earlier in this notebook.
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words=stopwords.words('spanish'), 
                           use_maxsum=True, nr_candidates=20, top_n=5)

[('mamario denso heterogéneo', 0.7187),
 ('mama derecha visualiza', 0.6128),
 ('nódulo isodenso bordes', 0.6502),
 ('espiculadas distorsiones microcalcificaciones', 0.6667),
 ('definidos 6mm reducido', 0.5857)]

In [46]:
# Trigram keyphrases using KeyBERT's MMR option (use_mmr=True) with a high
# diversity weight (0.7) — presumably the library counterpart of the
# hand-written mmr() defined earlier in this notebook.
model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words=stopwords.words('spanish'), 
                           use_mmr=True, diversity=0.7)

[('bordes aceptablemente definidos', 0.7187),
 ('solo puntiformes aisladas', 0.5748),
 ('6mm reducido tamaño', 0.6352),
 ('heterogéneo pseudonodular disminuye', 0.5857),
 ('posterior mama derecha', 0.6083)]

In [59]:
# Six-word keyphrases with stop words kept (stop_words=None), selected with
# MMR at a low diversity weight (0.2).
# NOTE(review): nr_candidates is passed alongside use_mmr=True; it presumably
# only matters for use_maxsum=True — confirm against the KeyBERT docs.
model.extract_keywords(doc, keyphrase_ngram_range=(6, 6), stop_words=None,
                       use_mmr=True, diversity=0.2, nr_candidates=20, top_n=5)

[('hay lesiones espiculadas distorsiones ni microcalcificaciones', 0.8135),
 ('bordes aceptablemente definidos de 6mm reducido', 0.8039),
 ('mamario denso heterogéneo pseudonodular lo que', 0.7988),
 ('nódulo isodenso de bordes aceptablemente definidos', 0.7915),
 ('pseudonodular lo que disminuye la sensibilidad', 0.7902)]

In [60]:
# Show the raw document string exactly as passed to KeyBERT (its repr makes
# the embedded newlines and leading/trailing padding visible).
doc

'\n         Parénquima mamario denso, heterogéneo y pseudonodular, lo que disminuye la sensibilidad del método.\nEn la unión de los cuadrantes inferiores, tercio posterior de la mama derecha, se visualiza nódulo isodenso, de bordes aceptablemente definidos, de 6mm., reducido de tamaño entre controles.\nNo hay lesiones espiculadas, distorsiones ni microcalcificaciones agrupadas de sospecha, solo puntiformes aisladas. \n      '