## Imports

In [6]:
from keybert import KeyBERT
#!pip install spacy
import spacy
#!python -m spacy download en_core_web_trf
!pip install spacy-transformers



## Model

In [2]:
# KeyBERT extractor built on the "distilbert-base-nli-mean-tokens" model;
# the `model.extract_keywords(...)` cells further down reuse this object.
model = KeyBERT(
    model="distilbert-base-nli-mean-tokens",
)

## Input

In [2]:
# Sample input texts used by the cells below.
# doc1: text about yoga mats; the sample itself is cut off and ends with "...".
doc1 = "Yoga mats (also called sticky mats) are used in most yoga classes to provide cushioning and traction. While you can usually rent a mat at a ..."
# doc2: two-sentence text about investing and retirement planning.
doc2 = "Plan for retirement, learn how to invest, and more. Access our investor education resources to get started or further develop your investing and trading strategies."
# doc3: product-title style text (a desk listing), no sentence punctuation.
doc3 = "Computer Desk With Hutch And Bookshelf, 47 Inches Home Office Desk With Space Saving Design For Small Spaces (Dark Walnut) Inbox Zero Color"
# doc4: multi-sentence text passed to choose_best_sentence further down.
# The misspelling "rethoric" is part of the sample and shows up in recorded outputs.
doc4 = "This is a list of sentences. Do some have rethoric questions? I don't know... This is a sentence about yoga mats though!"

## Output

In [4]:
# Ten best unigram/bigram keyphrases for doc1 with English stop words removed.
# Both re-ranking switches (Max Sum, MMR) are explicitly turned off.
# NOTE(review): per the KeyBERT docs `diversity` only applies when use_mmr=True,
# so it looks inert here -- confirm before relying on it.
output1 = model.extract_keywords(
    doc1,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    top_n=10,
    use_maxsum=False,
    use_mmr=False,
    diversity=0.9,
)
output1

[('yoga mats', 0.5897),
 ('yoga classes', 0.5886),
 ('used yoga', 0.4376),
 ('yoga', 0.4116),
 ('sticky mats', 0.3777),
 ('mats used', 0.3537),
 ('cushioning traction', 0.2746),
 ('classes provide', 0.2717),
 ('provide cushioning', 0.2704),
 ('traction usually', 0.2512)]

In [5]:
# Ten best unigram/bigram keyphrases for doc2 with English stop words removed.
output2 = model.extract_keywords(
    doc2,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    top_n=10,
)
output2

[('investor education', 0.5795),
 ('develop investing', 0.5221),
 ('investing trading', 0.5068),
 ('plan retirement', 0.4869),
 ('retirement learn', 0.4866),
 ('trading strategies', 0.4255),
 ('learn invest', 0.4131),
 ('investing', 0.382),
 ('access investor', 0.3806),
 ('education resources', 0.3363)]

In [6]:
# Ten best unigram/bigram keyphrases for doc3 with English stop words removed.
output3 = model.extract_keywords(
    doc3,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    top_n=10,
)
output3

[('computer desk', 0.5546),
 ('office desk', 0.5493),
 ('desk hutch', 0.4791),
 ('desk space', 0.4788),
 ('desk', 0.4281),
 ('walnut inbox', 0.4236),
 ('dark walnut', 0.4195),
 ('hutch bookshelf', 0.3808),
 ('bookshelf', 0.3786),
 ('bookshelf 47', 0.378)]

## KeyBERT & spaCy

In [9]:
# Ask spaCy to run on a GPU when one is available.
spacy.prefer_gpu()

# Load the transformer pipeline without the components listed in `exclude`,
# then add a sentencizer so sentence boundaries (`doc.sents`) are available.
nlp = spacy.load(
    "en_core_web_trf",
    exclude=["tagger", "parser", "ner", "attribute_ruler", "lemmatizer"],
)
nlp.add_pipe("sentencizer")

# Keyword extraction for doc1 with the spaCy pipeline passed as KeyBERT's model.
kw_model = KeyBERT(model=nlp)
kw_model.extract_keywords(doc1, keyphrase_ngram_range=(1, 2), top_n=10)

[('classes provide', 0.8944),
 ('sticky mats', 0.8909),
 ('rent mat', 0.8902),
 ('traction', 0.888),
 ('called', 0.8874),
 ('used', 0.8871),
 ('provide cushioning', 0.8866),
 ('called sticky', 0.8864),
 ('yoga mats', 0.8862),
 ('mats used', 0.8848)]

## Picking out a single sentence

In [14]:
# Sentence spans of doc4 as segmented by the spaCy pipeline `nlp`.
# Assigned here explicitly: without this the name is not defined anywhere at
# notebook level and the cell fails with NameError on a fresh kernel.
nlp_phrases = list(nlp(doc4).sents)
nlp_phrases

[This is a list of sentences.,
 Do some have rethoric questions?,
 I don't know... This is a sentence about yoga mats though!]

In [36]:
# Character budget; the official limit is reportedly 320.
# NOTE(review): not referenced by any code visible in this notebook.
MAX_CHARS = 300


def choose_best_sentence(text):
    """Return the longest sentence found in ``text``.

    The text is segmented with the notebook-level spaCy pipeline ``nlp`` and the
    resulting sentence spans are ranked with ``len``. For a spaCy span ``len``
    counts tokens, not characters.

    Args:
        text: Raw text to segment into sentences.

    Returns:
        The longest sentence span (the first one wins on ties), or ``None``
        when segmentation yields no sentence at all, e.g. for empty text.
    """
    sentences = list(nlp(text).sents)
    # `default` keeps max() from raising ValueError when there are no sentences.
    return max(sentences, key=len, default=None)

In [37]:
# Display the sentence that choose_best_sentence selects from doc4.
choose_best_sentence(doc4)

I don't know... This is a sentence about yoga mats though!