In [113]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from gensim.summarization import keywords
from keybert import KeyBERT
from rake_nltk import Rake
import yake
import json
import spacy
import pke
import textstat
import nltk

In [114]:
# 1. RAKE
def rake_extractor(text, top_n=10):
    """
    Uses Rake to extract the top-ranked key phrases from a text
    Arguments: text (str)
               top_n (int): maximum number of phrases to return (default 10)
    Returns: list of keywords (list)
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)
    # The ranked list is sliced, so fewer than top_n phrases may come back
    # when the text yields fewer candidates.
    return rake.get_ranked_phrases()[:top_n]

# 2. YAKE
def yake_extractor(text, top_n=10):
    """
    Uses YAKE to extract the top-ranked keywords from a text
    Arguments: text (str)
               top_n (int): number of keywords requested from YAKE (default 10)
    Returns: list of keywords (list)
    """
    extractor = yake.KeywordExtractor(lan="en", n=3, windowsSize=3, top=top_n)
    # Named scored_keywords (not `keywords`) so the gensim `keywords` import
    # is not shadowed inside this function.
    scored_keywords = extractor.extract_keywords(text)
    # Each result pairs a keyword with its score; keep only the string member
    # so the output is a flat list of phrases.
    return [
        item
        for scored_keyword in scored_keywords
        for item in scored_keyword
        if isinstance(item, str)
    ]


# 3. PositionRank
def position_rank_extractor(text, top_n=10):
    """
    Uses PositionRank to extract the top-ranked keyphrases from a text
    Arguments: text (str)
               top_n (int): number of keyphrases to return (default 10)
    Returns: list of keywords (list)
    """
    # define the valid Part-of-Speeches to occur in the graph
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.PositionRank()
    extractor.load_document(text, language='en')
    # candidates may span up to 5 words
    extractor.candidate_selection(maximum_word_number=5)
    # weight the candidates; only words whose tag is in `pos` are passed as
    # graph nodes, and the co-occurrence window is 3 words
    extractor.candidate_weighting(window=3, pos=pos)
    # get_n_best yields (keyphrase, score) pairs; drop the scores
    return [phrase for phrase, _score in extractor.get_n_best(n=top_n)]

# 4. SingleRank
def single_rank_extractor(text, top_n=10):
    """
    Uses SingleRank to extract the top-ranked keyphrases from a text
    Arguments: text (str)
               top_n (int): number of keyphrases to return (default 10)
    Returns: list of keywords (list)
    """
    # the same Part-of-Speech set drives both candidate selection and weighting
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.SingleRank()
    extractor.load_document(text, language='en')
    extractor.candidate_selection(pos=pos)
    # the co-occurrence window is 3 words
    extractor.candidate_weighting(window=3, pos=pos)
    # get_n_best yields (keyphrase, score) pairs; drop the scores
    return [phrase for phrase, _score in extractor.get_n_best(n=top_n)]

# 5. MultipartiteRank
def multipartite_rank_extractor(text, top_n=10):
    """
    Uses MultipartiteRank to extract the top-ranked keyphrases from a text
    Arguments: text (str)
               top_n (int): number of keyphrases to return (default 10)
    Returns: list of keywords (list)
    """
    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(text, language='en')
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor.candidate_selection(pos=pos)
    # build the Multipartite graph and rank candidates using random walk,
    # alpha controls the weight adjustment mechanism, see TopicRank for
    # threshold/method parameters.
    extractor.candidate_weighting(alpha=1.1, threshold=0.74, method='average')
    # get_n_best yields (keyphrase, score) pairs; drop the scores
    return [phrase for phrase, _score in extractor.get_n_best(n=top_n)]

# 6. TopicRank
def topic_rank_extractor(text, top_n=10):
    """
    Uses TopicRank to extract the top-ranked keyphrases from a text
    Arguments: text (str)
               top_n (int): number of keyphrases to return (default 10)
    Returns: list of keywords (list)
    """
    extractor = pke.unsupervised.TopicRank()
    extractor.load_document(text, language='en')
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor.candidate_selection(pos=pos)
    # weighting runs with the library's default parameters
    extractor.candidate_weighting()
    # get_n_best yields (keyphrase, score) pairs; drop the scores
    return [phrase for phrase, _score in extractor.get_n_best(n=top_n)]

# 7. KeyBERT
def keybert_extractor(text, top_n=10):
    """
    Uses KeyBERT to extract the top-ranked keywords from a text
    Arguments: text (str)
               top_n (int): number of keywords requested from KeyBERT (default 10)
    Returns: list of keywords (list)
    """
    # Build the KeyBERT instance once and keep it on the function object so
    # repeated calls reuse it instead of constructing a new one every time.
    bert = getattr(keybert_extractor, "_model", None)
    if bert is None:
        bert = KeyBERT()
        keybert_extractor._model = bert
    scored_keywords = bert.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=top_n,
    )
    # Each result pairs a keyword with its score; keep only the string member
    # so the output is a flat list of phrases.
    return [
        item
        for scored_keyword in scored_keywords
        for item in scored_keyword
        if isinstance(item, str)
    ]

In [115]:
class Filer():
    """
    Reads JSON from a fixed input path and writes JSON to arbitrary paths.
    """

    def __init__(self, inputf):
        """
        Arguments: inputf (str): path of the JSON file read by read_file()
        """
        self.inputf = inputf

    def write_file(self, data, outputf):
        """
        Serializes data as JSON into outputf, overwriting any existing file
        Arguments: data (JSON-serializable object), outputf (str)
        Returns: None
        """
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(outputf, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    def read_file(self):
        """
        Loads and returns the JSON content of the input file
        Returns: the decoded JSON value
        """
        # Explicit UTF-8: non-ASCII JSON would be mis-decoded under a
        # non-UTF-8 locale default.
        with open(self.inputf, encoding='utf-8') as f:
            return json.load(f)

In [116]:
class Summarizer():
    """
    Thin wrapper around a transformers "summarization" pipeline.
    """

    def __init__(self, model):
        """
        Arguments: model (str): model identifier handed to pipeline()
        """
        self.model = model
        self.summarizer = pipeline("summarization", model=self.model)

    def get_summary(self, text):
        """
        Summarizes a text with the wrapped pipeline
        Arguments: text (str)
        Returns: summary (str); "" when text has no words
        """
        total_words = len(text.split())
        # Nothing to summarize; also avoids asking the pipeline for
        # max_length=0.
        if total_words == 0:
            return ""
        # The whitespace word count of the input is used as the max_length cap.
        result = self.summarizer(text, max_length=total_words, do_sample=False)
        return result[0].get('summary_text')

In [117]:
class KeywordExtractor():
    """
    Selects one of the keyword extraction functions by model name.
    """

    def __init__(self, model):
        """
        Arguments: model (str): one of 'KEYBERT', 'TOPIC', 'MULTIPARTITE',
                   'SINGLE', 'YAKE', 'RAKE', 'POSITION'
        """
        self.model = model

    def get_keywords(self, text):
        """
        Runs the extractor selected by self.model on a text
        Arguments: text (str)
        Returns: dict with a single {label: list of keywords} entry,
                 or an empty dict when the model name is not recognized
        """
        # (model name, output label, deferred extractor call); the lambdas
        # ensure only the selected extractor actually runs.
        routes = (
            ('KEYBERT', 'KEYBERT', lambda: keybert_extractor(text)),
            ('TOPIC', 'TOPIC RANK', lambda: topic_rank_extractor(text)),
            ('MULTIPARTITE', 'MULTIPARTITE RANK', lambda: multipartite_rank_extractor(text)),
            ('SINGLE', 'SINGLE RANK', lambda: single_rank_extractor(text)),
            ('YAKE', 'YAKE', lambda: yake_extractor(text)),
            ('RAKE', 'RAKE', lambda: rake_extractor(text)),
            ('POSITION', 'POSITION RANK', lambda: position_rank_extractor(text)),
        )
        for model_name, label, run in routes:
            if self.model == model_name:
                return {label: run()}
        return {}

In [118]:
def get_fulltext_summary(text, ex_summarizer):
    """
    Summarizes every section of a document and concatenates the results
    Arguments: text (dict): record with text['original']['sections'] iterable
               ex_summarizer: object exposing get_summary(section)
    Returns: the section summaries joined with no separator (str)
    """
    sections = text.get('original').get('sections')
    return "".join(ex_summarizer.get_summary(section) for section in sections)

In [119]:
def process_data(json_data, summ, keyword):
    """
    Builds one output record per input document: the source fields with
    token counts, the three summaries, and the extracted keywords
    Arguments: json_data (iterable of dict): records with 'original' and
                   'article' sub-dicts
               summ (str): summarization model name passed to Summarizer
               keyword (str): extractor name passed to KeywordExtractor
    Returns: list of records (list of dict)
    """
    keyword_extractor = KeywordExtractor(keyword)
    summarizer = Summarizer(summ)

    def token_count(value):
        return len(nltk.word_tokenize(value))

    records = []
    for entry in json_data:
        original = entry.get('original')
        abstract_text = original.get('abstract')
        abstract_summary = summarizer.get_summary(abstract_text)
        fulltext_summary = get_fulltext_summary(entry, summarizer)
        article = entry.get('article')
        article_text = article.get('text')
        article_summary = summarizer.get_summary(article_text)

        abstract_part = {
            "title": original.get('title'),
            "url": original.get('url'),
            "abstract": abstract_text,
            "tokens": token_count(abstract_text),
        }
        full_text = original.get('text')
        full_text_part = {
            "title": original.get('title'),
            "url": original.get('url'),
            "keywords": original.get('keywords'),
            "text": full_text,
            "sections": original.get('sections'),
            "tokens": token_count(full_text),
        }
        article_part = {
            "title": article.get('title'),
            "url": article.get('url'),
            "keywords": article.get('keywords'),
            "text": article_text,
            "tokens": token_count(article_text),
        }
        summaries_part = {
            "abstract": {
                "text": abstract_summary,
                "tokens": token_count(abstract_summary),
            },
            "full_text": {
                "text": fulltext_summary,
                "tokens": token_count(fulltext_summary),
            },
            "article": {
                "text": article_summary,
                "tokens": token_count(article_summary),
            },
        }
        keywords_part = {
            "abstract": keyword_extractor.get_keywords(abstract_text),
            "full_text": keyword_extractor.get_keywords(full_text),
            "article": keyword_extractor.get_keywords(article_text),
        }

        records.append({
            "abstract": abstract_part,
            "full_text": full_text_part,
            "article": article_part,
            "summaries": summaries_part,
            "keywords": keywords_part,
        })
    return records

In [120]:
# Candidate models. zip() below pairs them positionally, so entry k of
# summ_models runs with entry k of keyword_models. The two '' entries are
# unfilled placeholders.
summ_models=['facebook/bart-large-cnn', 'sshleifer/distilbart-cnn-12-6', 'philschmid/bart-large-cnn-samsum', 'google/pegasus-large', 'sshleifer/distill-pegasus-cnn-16-4', '', '']
keyword_models=['KEYBERT', 'YAKE', 'RAKE', 'POSITION', 'SINGLE', 'MULTIPARTITE', 'TOPIC']

# Single-run defaults; not used by the loop below.
summ = 'facebook/bart-large-cnn'
keyword = 'KEYBERT'

filer = Filer('/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/input.json')
json_data = filer.read_file()

for x,j in zip(summ_models, keyword_models):
    # Skip placeholder slots: an empty model name cannot be loaded and has no
    # '/' to derive an output file name from. The keyword models paired with
    # them (MULTIPARTITE, TOPIC) are therefore not run.
    if not x:
        continue
    data = process_data(json_data, x, j)
    # Last path component of the model id; also works for ids without an
    # organisation prefix.
    sun = x.split('/')[-1]
    filer.write_file(data, f'/home/dxmonteiro/Desktop/WORKSPACE/ProfExtra/output_{sun}_{j}.json')


