# Setup

In [None]:
# Install pke straight from its GitHub repository, then download the NLTK
# resources (stopwords, universal_tagset, punkt) and the spaCy "en" model
# that the rest of this notebook loads.
!pip install git+https://github.com/boudinfl/pke.git
!python -m nltk.downloader stopwords
!python -m nltk.downloader universal_tagset
!python -m spacy download en
!python -m nltk.downloader punkt

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pke
import string
import nltk
from nltk.corpus import stopwords
import spacy
import time

# Load the spaCy pipeline registered under the name "en".
# NOTE(review): `nlp` is not referenced anywhere else in the visible part of
# this notebook — confirm whether it is still needed.
nlp = spacy.load("en")

# Maps the two-letter language codes accepted by the helper functions to the
# language names passed to `stopwords.words(...)`.
LANGUAGE_CODES = {
    'it': 'italian',
    'en': 'english',
    'de': 'german',
    'es': 'spanish',
    'fr': 'french',
    'nl': 'dutch'
}

# Part-of-speech tags passed as `pos` to the TextRank, TopicRank and
# MultipartiteRank helpers in the result cells.
PART_OF_SPEECH = {'NOUN', 'ADJ', 'ADV', 'PROPN'}

# Base stoplist: every ASCII punctuation character plus six bracket
# placeholder tokens (presumably Penn-Treebank-style escapes — confirm).
# The TopicRank and MultipartiteRank helpers extend it with the NLTK
# stopwords of the requested language.
STOPLIST = list(string.punctuation) + ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

# Graph-based algorithms

In [None]:
def getTextRankSequences(text, language, n, window, top_percent, pos):
    """Return the `n` best keyphrases of `text` according to pke's TextRank.

    Args:
        text: Document to analyse, handed to `load_document` as `input`.
        language: Language code forwarded to `load_document`.
        n: Number of keyphrases requested from `get_n_best`.
        window: Forwarded to `candidate_weighting`.
        top_percent: Forwarded to `candidate_weighting`.
        pos: Part-of-speech tags used for both candidate selection and
            candidate weighting.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; the rest of this
        notebook treats it as a sequence of (keyphrase, score) records.
    """
    textrank = pke.unsupervised.TextRank()
    textrank.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )
    textrank.candidate_selection(pos=pos)

    weighting_options = {
        'window': window,
        'pos': pos,
        'top_percent': top_percent,
    }
    textrank.candidate_weighting(**weighting_options)

    best_keyphrases = textrank.get_n_best(n, stemming=False)
    return best_keyphrases


def getTopicRankSequences(text, language, n, window, threshold, pos):
    """Return the `n` best keyphrases of `text` according to pke's TopicRank.

    Args:
        text: Document to analyse, handed to `load_document` as `input`.
        language: Language code; must be a key of `LANGUAGE_CODES` because
            the matching NLTK stopword list is looked up through it.
        n: Number of keyphrases requested from `get_n_best`.
        window: Accepted for signature parity with the other helpers but
            not used by this function.
        threshold: Forwarded to `candidate_weighting`.
        pos: Part-of-speech tags used for candidate selection.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; the rest of this
        notebook treats it as a sequence of (keyphrase, score) records.
    """
    topicrank = pke.unsupervised.TopicRank()
    topicrank.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )

    # Punctuation/bracket tokens first, then the language's stopwords.
    language_stopwords = stopwords.words(LANGUAGE_CODES[language])
    excluded_tokens = STOPLIST + language_stopwords

    topicrank.candidate_selection(pos=pos, stoplist=excluded_tokens)
    topicrank.candidate_weighting(threshold=threshold, method='average')

    best_keyphrases = topicrank.get_n_best(n, stemming=False)
    return best_keyphrases


def getMultipartiteRankSequences(text, language, n, alpha, threshold, pos):
    """Return the `n` best keyphrases of `text` via pke's MultipartiteRank.

    Args:
        text: Document to analyse, handed to `load_document` as `input`.
        language: Language code; must be a key of `LANGUAGE_CODES` because
            the matching NLTK stopword list is looked up through it.
        n: Number of keyphrases requested from `get_n_best`.
        alpha: Forwarded to `candidate_weighting`.
        threshold: Forwarded to `candidate_weighting`.
        pos: Part-of-speech tags used for candidate selection.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; the rest of this
        notebook treats it as a sequence of (keyphrase, score) records.
    """
    multipartite = pke.unsupervised.MultipartiteRank()
    multipartite.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )

    # Punctuation/bracket tokens first, then the language's stopwords.
    language_stopwords = stopwords.words(LANGUAGE_CODES[language])
    excluded_tokens = STOPLIST + language_stopwords

    multipartite.candidate_selection(pos=pos, stoplist=excluded_tokens)

    weighting_options = {
        'alpha': alpha,
        'threshold': threshold,
        'method': 'average',
    }
    multipartite.candidate_weighting(**weighting_options)

    best_keyphrases = multipartite.get_n_best(n, stemming=False)
    return best_keyphrases


def getPositionRankSequences(text, language, n, window, maximum_word_number):
    """Return the `n` best keyphrases of `text` via pke's PositionRank.

    Unlike the other helpers, the document is loaded with
    `normalization=None` and `get_n_best` is called with its default
    arguments.

    Args:
        text: Document to analyse, handed to `load_document` as `input`.
        language: Language code forwarded to `load_document`.
        n: Number of keyphrases requested from `get_n_best`.
        window: Forwarded to `candidate_weighting`.
        maximum_word_number: Forwarded to `candidate_selection`.

    Returns:
        Whatever `get_n_best(n)` yields; the rest of this notebook treats
        it as a sequence of (keyphrase, score) records.
    """
    # Candidate pattern: any number of adjectives followed by at least one
    # noun, proper noun or adverb.
    np_grammar = "NP: {<ADJ>*<NOUN|PROPN|ADV>+}"
    weighting_pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}

    position_rank = pke.unsupervised.PositionRank()
    position_rank.load_document(
        input=text,
        language=language,
        normalization=None,
    )
    position_rank.candidate_selection(
        grammar=np_grammar,
        maximum_word_number=maximum_word_number,
    )
    position_rank.candidate_weighting(window=window, pos=weighting_pos)

    best_keyphrases = position_rank.get_n_best(n)
    return best_keyphrases


# Get extracted sentences based on keyphrases

In [None]:
def printExtractedSentences(results, sentences=None):
    """Print a keyphrase/score table and collect the sentences that contain them.

    Each record is printed as the keyphrase, left-justified to the width of
    the longest keyphrase, followed by its score. A sentence is selected
    when a keyphrase occurs in it as a case-insensitive substring; selected
    sentences are lower-cased and have runs of whitespace collapsed.

    Args:
        results: Iterable of records whose first item is the keyphrase
            string and whose second item is its score.
        sentences: Sentence strings to search. When omitted, the
            notebook-level `sentences_list` is used, which keeps existing
            single-argument calls working.

    Returns:
        A `(keyphrases_list, unique_sentences)` tuple: the keyphrases in
        ranking order and the set of normalised sentences containing at
        least one of them. Empty `results` gives `([], set())`.
    """
    if sentences is None:
        # Backward-compatible default: fall back to the global produced by
        # the sentence-tokenization cell.
        sentences = sentences_list

    results = list(results)
    keyphrases_list = [record[0] for record in results]

    # `default=0` keeps an empty result set from raising ValueError.
    longest_length = max((len(phrase) for phrase in keyphrases_list), default=0)

    for record in results:
        print(' '.join([record[0].ljust(longest_length), str(record[1])]))

    # Normalise every sentence once instead of once per keyphrase:
    # (text used for matching, text stored in the result).
    prepared = [
        (sentence.strip().lower(), " ".join(sentence.lower().split()))
        for sentence in sentences
    ]

    unique_sentences = set()
    for keyphrase in keyphrases_list:
        needle = keyphrase.strip().lower()
        for haystack, collapsed in prepared:
            if needle in haystack:
                unique_sentences.add(collapsed)

    return keyphrases_list, unique_sentences

# Print found keyphrases in bold within the entire text

In [None]:
from IPython.display import Markdown, display
import re


def printmd(string):
    """Render `string` as Markdown in the notebook output."""
    rendered = Markdown(string)
    display(rendered)
    
def boldify_text(text, keyphrases_list):
    """Display `text` as Markdown with every keyphrase occurrence in bold italics.

    Matching is case-insensitive and anchored on word boundaries. The
    matched span is wrapped in `***` and the result is passed to `printmd`.

    Args:
        text: The full text to display.
        keyphrases_list: Keyphrase strings to highlight. Empty strings are
            ignored; if nothing is left, `text` is displayed unchanged.
    """
    # An empty alternative would match the empty string at every word
    # boundary and litter the output with "******".
    phrases = [phrase for phrase in keyphrases_list if phrase]
    if not phrases:
        printmd(text)
        return

    # Alternation takes the first alternative that matches, so try longer
    # phrases first: "battery life" must win over "battery".
    phrases.sort(key=len, reverse=True)

    # Escape each phrase so characters such as "+", "(" or "." are matched
    # literally instead of being interpreted as regex syntax.
    alternatives = '|'.join(re.escape(phrase) for phrase in phrases)
    regex = re.compile(r'\b(?:%s)\b' % alternatives, re.I)

    pieces = []
    cursor = 0
    for match in regex.finditer(text):
        pieces.extend([text[cursor:match.start()], "***", match.group(0), "***"])
        cursor = match.end()

    # Use the cursor rather than the last match object, so a text with no
    # match at all is still displayed instead of raising.
    pieces.append(text[cursor:])
    printmd("".join(pieces))

# Text to analyze

In [None]:
# Sample document analysed by every extractor in the result cells.
text = """I've had this product since 2015. The trimmer works really good, but unfortunately the battery life is terrible. After about a year of use, I have to expedite my shaves before the trimmer completely dies and needs to be recharged. Expensive product for a terrible battery life.
"""
# Must be a key of LANGUAGE_CODES: the TopicRank and MultipartiteRank helpers
# use it to look up the NLTK stopword list.
language = 'en'

# From text to sentences

In [None]:
# Split the text into sentences with NLTK and list them with their index,
# collapsing runs of whitespace inside each sentence.
print("*** Sentences ***\n")
sentences_list = nltk.tokenize.sent_tokenize(text)
for i, sentence in enumerate(sentences_list):
    collapsed = " ".join(sentence.split())
    print("".join([str(i), ") ", collapsed, "\n"]))

# TextRank results

In [None]:
# Rank keyphrases with TextRank, print them with their scores, then show the
# text with every keyphrase occurrence highlighted.
# start_time = time.time()
results = getTextRankSequences(
    text=text,
    language=language,
    n=10,
    window=2,
    top_percent=0.33,
    pos=PART_OF_SPEECH,
)
# print("--- %s seconds ---" % (time.time() - start_time))
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)

# TopicRank results

In [None]:
# Rank keyphrases with TopicRank, print them with their scores, then show the
# text with every keyphrase occurrence highlighted.
# start_time = time.time()
results = getTopicRankSequences(
    text=text,
    language=language,
    n=20,
    window=2,
    threshold=0.74,
    pos=PART_OF_SPEECH,
)
# print("--- %s seconds ---" % (time.time() - start_time))
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)

# MultipartiteRank results

In [None]:
# Rank keyphrases with MultipartiteRank, print them with their scores, then
# show the text with every keyphrase occurrence highlighted.
# start_time = time.time()
results = getMultipartiteRankSequences(
    text=text,
    language=language,
    n=20,
    alpha=1.1,
    threshold=0.74,
    pos=PART_OF_SPEECH,
)
# print("--- %s seconds ---" % (time.time() - start_time))
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)

# PositionRank results

In [None]:
# Rank keyphrases with PositionRank, print them with their scores, then show
# the text with every keyphrase occurrence highlighted.
# start_time = time.time()
results = getPositionRankSequences(
    text=text,
    language=language,
    n=20,
    window=10,
    maximum_word_number=3,
)
# print("--- %s seconds ---" % (time.time() - start_time))
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)