# Setup

In [None]:
# Install pke straight from its GitHub repository, then fetch the NLTK
# resources (stopwords, universal_tagset, punkt) and the spaCy "en" model.
# NOTE(review): nothing here is version-pinned, so a fresh run may pull newer
# releases than the ones this notebook was executed with — confirm compatibility.
!pip install git+https://github.com/boudinfl/pke.git
!python -m nltk.downloader stopwords
!python -m nltk.downloader universal_tagset
!python -m spacy download en
!python -m nltk.downloader punkt

In [38]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pke
import string
import nltk
from nltk.corpus import stopwords
import spacy
import time

# Load the spaCy model registered as "en" (fetched in the setup cell via
# `python -m spacy download en`).
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
nlp = spacy.load("en")

# Two-letter language code -> language name handed to nltk's stopwords.words()
# by the TopicRank / MultipartiteRank helpers.
LANGUAGE_CODES = dict(
    it='italian',
    en='english',
    de='german',
    es='spanish',
    fr='french',
    nl='dutch',
)

# Part-of-speech tags passed as `pos` to the extractor helpers.
PART_OF_SPEECH = set(('NOUN', 'ADJ', 'ADV', 'PROPN'))

# Every punctuation character followed by six bracket placeholder tokens
# (presumably Penn-Treebank-style escapes — verify against the tokenizer used).
STOPLIST = [
    *string.punctuation,
    '-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-',
]

# Graph-based algorithms

In [39]:
def getTextRankSequences(text, language, n, window, top_percent, pos):
    """Run pke's TextRank on `text` and return its `n` best keyphrases.

    Args:
        text: document content, handed to `load_document` as `input`.
        language: language code forwarded to `load_document`.
        n: number of keyphrases requested from `get_n_best`.
        window: forwarded to `candidate_weighting`.
        top_percent: forwarded to `candidate_weighting`.
        pos: part-of-speech tags used for both candidate selection and weighting.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; other cells in this
        notebook read each record as (keyphrase, score).
    """
    ranker = pke.unsupervised.TextRank()

    # The document is loaded with lemmatization as the normalization mode.
    ranker.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )

    ranker.candidate_selection(pos=pos)
    ranker.candidate_weighting(
        window=window,
        pos=pos,
        top_percent=top_percent,
    )

    best = ranker.get_n_best(n, stemming=False)
    return best


def getTopicRankSequences(text, language, n, window, threshold, pos):
    """Run pke's TopicRank on `text` and return its `n` best keyphrases.

    Args:
        text: document content, handed to `load_document` as `input`.
        language: language code; forwarded to `load_document` and used as a
            key into LANGUAGE_CODES (an unknown code raises KeyError).
        n: number of keyphrases requested from `get_n_best`.
        window: accepted for signature parity with the other helpers but not
            used by this function.
        threshold: forwarded to `candidate_weighting`.
        pos: part-of-speech tags used for candidate selection.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; other cells in this
        notebook read each record as (keyphrase, score).
    """
    ranker = pke.unsupervised.TopicRank()
    ranker.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )

    # Punctuation / bracket tokens plus the NLTK stopwords of the language.
    language_name = LANGUAGE_CODES[language]
    ignored_tokens = STOPLIST + stopwords.words(language_name)

    ranker.candidate_selection(pos=pos, stoplist=ignored_tokens)
    ranker.candidate_weighting(threshold=threshold, method='average')

    best = ranker.get_n_best(n, stemming=False)
    return best


def getMultipartiteRankSequences(text, language, n, alpha, threshold, pos):
    """Run pke's MultipartiteRank on `text` and return its `n` best keyphrases.

    Args:
        text: document content, handed to `load_document` as `input`.
        language: language code; forwarded to `load_document` and used as a
            key into LANGUAGE_CODES (an unknown code raises KeyError).
        n: number of keyphrases requested from `get_n_best`.
        alpha: forwarded to `candidate_weighting`.
        threshold: forwarded to `candidate_weighting`.
        pos: part-of-speech tags used for candidate selection.

    Returns:
        Whatever `get_n_best(n, stemming=False)` yields; other cells in this
        notebook read each record as (keyphrase, score).
    """
    ranker = pke.unsupervised.MultipartiteRank()
    ranker.load_document(
        input=text,
        language=language,
        normalization='lemmatization',
    )

    # Punctuation / bracket tokens plus the NLTK stopwords of the language.
    language_name = LANGUAGE_CODES[language]
    ignored_tokens = STOPLIST + stopwords.words(language_name)

    ranker.candidate_selection(pos=pos, stoplist=ignored_tokens)
    ranker.candidate_weighting(
        alpha=alpha,
        threshold=threshold,
        method='average',
    )

    best = ranker.get_n_best(n, stemming=False)
    return best


def getPositionRankSequences(text, language, n, window, maximum_word_number):
    """Run pke's PositionRank on `text` and return its `n` best keyphrases.

    Unlike the other helpers, the part-of-speech set and the candidate grammar
    are fixed inside the function, the document is loaded with
    `normalization=None`, and `get_n_best` is called with its default options.

    Args:
        text: document content, handed to `load_document` as `input`.
        language: language code forwarded to `load_document`.
        n: number of keyphrases requested from `get_n_best`.
        window: forwarded to `candidate_weighting`.
        maximum_word_number: forwarded to `candidate_selection`.

    Returns:
        Whatever `get_n_best(n)` yields; other cells in this notebook read each
        record as (keyphrase, score).
    """
    weighting_pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    np_grammar = "NP: {<ADJ>*<NOUN|PROPN|ADV>+}"

    ranker = pke.unsupervised.PositionRank()
    ranker.load_document(input=text, language=language, normalization=None)

    ranker.candidate_selection(
        grammar=np_grammar,
        maximum_word_number=maximum_word_number,
    )
    ranker.candidate_weighting(window=window, pos=weighting_pos)

    best = ranker.get_n_best(n)
    return best


# Get extracted sentences based on keyphrases

In [40]:
def printExtractedSentences(results, sentences=None):
    """Print each keyphrase with its score and collect the sentences mentioning one.

    Args:
        results: iterable of records where ``record[0]`` is the keyphrase
            string and ``record[1]`` its score. May be empty.
        sentences: optional list of sentence strings to search. When omitted,
            the notebook-level ``sentences_list`` is used, which keeps existing
            single-argument calls working.

    Returns:
        A ``(keyphrases_list, unique_sentences)`` tuple: the keyphrases in
        input order, and the set of lower-cased, whitespace-normalised
        sentences that contain at least one keyphrase (case-insensitive
        substring match).
    """
    if sentences is None:
        # Fall back to the global built in the "From text to sentences" cell.
        sentences = sentences_list

    results = list(results)
    keyphrases_list = [record[0] for record in results]

    # default=0 keeps an empty result list from raising ValueError in max().
    longest_length = max((len(keyphrase) for keyphrase in keyphrases_list), default=0)

    for record in results:
        print(' '.join([record[0].ljust(longest_length), str(record[1])]))

    # Normalise case and whitespace once per sentence / keyphrase so a phrase
    # that spans a line break or a double space in the raw text is still found.
    normalized_sentences = [" ".join(sentence.lower().split()) for sentence in sentences]
    needles = [" ".join(keyphrase.lower().split()) for keyphrase in keyphrases_list]

    unique_sentences = {
        sentence
        for sentence in normalized_sentences
        if any(needle in sentence for needle in needles)
    }

    return keyphrases_list, unique_sentences

# Print found keyphrases in bold within the entire text

In [41]:
from IPython.display import Markdown, display
import re


def printmd(string):
    """Render `string` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)
    
def boldify_text(text, keyphrases_list):
    """Display `text` as Markdown with every keyphrase occurrence in bold italics.

    Matching is case-insensitive and anchored on word boundaries. When several
    keyphrases could match at the same position, the one listed first in
    `keyphrases_list` wins.

    Args:
        text: the full text to display.
        keyphrases_list: keyphrase strings to highlight; they are treated as
            literal text, not as regular expressions.

    Returns:
        None. The result is rendered through `printmd`.
    """
    # Drop empty phrases: an empty alternative would match at every word boundary.
    phrases = [phrase for phrase in keyphrases_list if phrase]
    if not phrases:
        printmd(text)
        return

    # Escape each phrase so characters such as '+', '.' or '(' are matched
    # literally instead of being interpreted as regex syntax.
    alternation = '|'.join(re.escape(phrase) for phrase in phrases)
    regex = re.compile(r'\b(?:%s)\b' % alternation, re.I)

    # sub() leaves the text untouched when nothing matches, so the
    # no-match case no longer fails on an unbound match object.
    printmd(regex.sub(lambda m: "***" + m.group(0) + "***", text))

# Text to analyze

In [42]:
# Sample product review analysed by every extractor below.
text = """I've had this product since 2015. The trimmer works really good, but unfortunately the battery life is terrible. After about a year of use, I have to expedite my shaves before the trimmer completely dies and needs to be recharged. Expensive product for a terrible battery life.
"""

# Language code of `text`. The result cells pass it as `language=` and the
# TopicRank / MultipartiteRank helpers use it as a key into LANGUAGE_CODES.
# It was never assigned anywhere in the notebook, so those cells raised
# NameError on a fresh kernel.
language = "en"

# From text to sentences

In [43]:
# Split the text into sentences with NLTK and list them with their index,
# collapsing runs of whitespace inside each sentence for display.
print("*** Sentences ***\n")
sentences_list = nltk.tokenize.sent_tokenize(text)
for i, sentence in enumerate(sentences_list):
    print(f"{i}) {' '.join(sentence.split())}\n")

*** Sentences ***

0) I've had this product since 2015.

1) The trimmer works really good, but unfortunately the battery life is terrible.

2) After about a year of use, I have to expedite my shaves before the trimmer completely dies and needs to be recharged.

3) Expensive product for a terrible battery life.



# TextRank results

In [44]:
# Top 10 TextRank keyphrases, their matching sentences, and the highlighted text.
# To time this cell, put the %%time cell magic on its first line.
results = getTextRankSequences(
    text=text,
    language=language,
    n=10,
    window=2,
    top_percent=0.33,
    pos=PART_OF_SPEECH,
)
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)



terrible battery life 0.3076928076923066
battery life          0.22884063489187062
expensive product     0.20512866512820452
trimmer completely    0.2051285751282045
really good           0.2051283151282045
battery               0.14998879209143462
really                0.10256421256410225
trimmer               0.10256419256410225
product               0.10256414256410226
terrible              0.078852042800436


I've had this ***product*** since 2015. The ***trimmer*** works ***really good***, but unfortunately the ***battery life*** is ***terrible***. After about a year of use, I have to expedite my shaves before the ***trimmer completely*** dies and needs to be recharged. ***Expensive product*** for a ***terrible battery life***.


# TopicRank results

In [45]:
# Up to 20 TopicRank keyphrases, their matching sentences, and the highlighted text.
# To time this cell, put the %%time cell magic on its first line.
results = getTopicRankSequences(
    text=text,
    language=language,
    n=20,
    window=2,
    threshold=0.74,
    pos=PART_OF_SPEECH,
)
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)



battery life  0.1608210679182538
trimmer       0.14673713959490473
unfortunately 0.10909229455725217
really good   0.1064066192556333
product       0.10473214017909888
terrible      0.10323611091810328
year          0.09863549423437384
use           0.09456798654033943
shaves        0.07577114680204071


I've had this ***product*** since 2015. The ***trimmer*** works ***really good***, but ***unfortunately*** the ***battery life*** is ***terrible***. After about a ***year*** of ***use***, I have to expedite my ***shaves*** before the ***trimmer*** completely dies and needs to be recharged. Expensive ***product*** for a ***terrible*** ***battery life***.


# MultipartiteRank results

In [46]:
# Up to 20 MultipartiteRank keyphrases, their matching sentences, and the highlighted text.
# To time this cell, put the %%time cell magic on its first line.
results = getMultipartiteRankSequences(
    text=text,
    language=language,
    n=20,
    alpha=1.1,
    threshold=0.74,
    pos=PART_OF_SPEECH,
)
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)



battery life          0.13382402384289527
trimmer               0.13069562451666528
really good           0.11172057829114589
unfortunately         0.10451962914987453
product               0.0934937062615649
terrible              0.0923348949694817
year                  0.07723052668069205
use                   0.07272886185178323
shaves                0.05633525933640143
trimmer completely    0.04830661093259242
expensive product     0.04110309103365303
terrible battery life 0.03770719313325012


I've had this ***product*** since 2015. The ***trimmer*** works ***really good***, but ***unfortunately*** the ***battery life*** is ***terrible***. After about a ***year*** of ***use***, I have to expedite my ***shaves*** before the ***trimmer*** completely dies and needs to be recharged. ***Expensive product*** for a ***terrible*** ***battery life***.


# PositionRank results

In [52]:
# Up to 20 PositionRank keyphrases (candidates of at most 3 words), their
# matching sentences, and the highlighted text.
results = getPositionRankSequences(
    text=text,
    language=language,
    n=20,
    window=10,
    maximum_word_number=3,
)
keyphrases_list, unique_sentences = printExtractedSentences(results)
boldify_text(text, keyphrases_list)



terrible battery life 0.3284325259389387
battery life          0.2228940605993519
expensive product     0.15171640096447214
trimmer completely    0.13198413940248171
product               0.09834709482559376
trimmer               0.09354137349647822
really                0.07120953826270209
unfortunately         0.06553908411018662
about                 0.05146932391470334
year                  0.050773666358921894
use                   0.04264961880457311
shaves                0.035895794927204114


I've had this ***product*** since 2015. The ***trimmer*** works ***really*** good, but ***unfortunately*** the ***battery life*** is terrible. After ***about*** a ***year*** of ***use***, I have to expedite my ***shaves*** before the ***trimmer completely*** dies and needs to be recharged. ***Expensive product*** for a ***terrible battery life***.
