In [22]:
from sparsesvd import sparsesvd
from scipy.sparse import csc_matrix
import numpy as np
import pickle
import pandas as pd

In [108]:
import os.path
from gensim import corpora
from gensim import matutils
from gensim.models import LsiModel, LdaModel, RpModel, HdpModel, TfidfModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

stop_words = list(stopwords.words('english'))

In [218]:
def remove_unicode(text):

    text = str(text).encode("ascii", "ignore")
    text = text.decode()

    return text

In [261]:
def preprocess_data(document, en_stemmer, en_stopwords):
    
    tokenizer = RegexpTokenizer(r'\w+')
    texts = [] 
    
    for sentence in document:
        
        sentence = str(sentence).lower()
        sentence = remove_unicode(sentence)
        sentence_token = tokenizer.tokenize(sentence)
        sentence_token = [en_stemmer.stem(i) for i in sentence_token if not i in en_stopwords]
        texts.append(sentence_token)
        
    return texts

In [262]:
def prepare_corpus(document):
    
    dictionary = corpora.Dictionary(document)
    freq_matrix = [dictionary.doc2bow(doc) for doc in document]

    return dictionary, freq_matrix

In [263]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):

    coherence_values = []
    model_list = []
    
    for num_topics in range(start, stop, step):
       
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    
    return model_list, coherence_values

In [264]:
def get_V(model, freq_matrix):
    
    V = matutils.corpus2dense(model[freq_matrix], len(model.projection.s)).T / model.projection.s
    
    return V

In [265]:
def format_data(corpus_model, k):
    
    aux = []
    for j, docv in enumerate(corpus_model):

        for i in range(k):
            
            try:
                aux.append([j, docv[i][0], docv[i][1]])
            except IndexError:
                pass
    
    name_columns = ['sentence', 'topic', 'weight']
    df = pd.DataFrame(aux, columns=name_columns)
    return df

In [266]:
def sentence_ranking(df):
    
    ranking = {}
    for i in range(k):
        ranking[i] = df.loc[df['topic'] == i].sort_values('weight')['sentence'].tolist()
        
    return ranking

In [267]:
def get_top_sentences(ranking, n_sentences=2):

    sentences = []
    for key in ranking.keys():
        count = 1 
        for i in range(n_sentences):
            
            try:
                sentence_id = ranking.get(key)[i]

                if not sentence_id in sentences:
                    sentences.append(ranking.get(key)[i])
                elif not count == len(ranking.get(key)):
                    sentences.append(ranking.get(key)[i+count])
                count+=1
            except IndexError:
                pass
            
    return sentences

In [268]:
def create_summary(text_sent, sentences_id):
    
    summary = [text_sent[i] for i in sentences_id]
    
    return " ".join(summary)

In [269]:
def main_topic_models(text, k=2, words=2, name_models=['lsi']):
    
    models = {}
    corpus = {}
    
    en_stopwords = set(stopwords.words('english'))
    en_stemmer = PorterStemmer()
    
    text_clean = preprocess_data(text, en_stemmer, en_stopwords)
    dictionary, freq_matrix = prepare_corpus(text_clean)

    for name_model in name_models:
        
        if name_model == 'lsi':
            model = LsiModel(freq_matrix, id2word=dictionary, num_topics=k)
            models[name_model] = model
        elif name_model == 'lda':
            model = LdaModel(freq_matrix, id2word=dictionary, num_topics=k)
            models[name_model] = model
        elif name_model == 'hdp':
            model = HdpModel(freq_matrix, id2word=dictionary)
            models[name_model] = model
            
        corpus[name_model] = model[freq_matrix]

    return models, corpus

In [270]:
def main_summarization(corpus, name_model, n_sentences=2, k=2):
    
    df = format_data(corpus[name_model], k)
    ranking = sentence_ranking(df)
    sentences_id = get_top_sentences(ranking, n_sentences=n_sentences)
    summary = create_summary(text, sentences_id)

    return summary, ranking, sentences_id

In [271]:
def load_data(section):
    section='conclusion'

    with open('dataset6_{}.pkl'.format('features'), 'rb') as fp:
        dataset = pickle.load(fp)
    
    train = dataset[section][4][['sentences', 'articles', 'rouge_1']]
    test = dataset[section][5][['sentences', 'articles', 'rouge_1']]
    
    return train, test

In [None]:
train, test = load_data(section)
articles_id = pd.unique(train['articles'])

summaries = {'article_id': [], 'summary': [], 'sentences_id': []}

test['sentences'] = test['sentences'].astype('str')

for article_id in articles_id:

    #print(article_id)
    aux = test.loc[train['articles'] == article_id]
    
    text = aux['sentences'].tolist()
    models, corpus = main_topic_models(text, k=2, words=2, name_models=['lsi', 'lda'])
    summary, _, sentences_id = main_summarization(corpus, name_model='lda', n_sentences=2, k=2)
    
    summaries['summary'].append(summary)
    summaries['article_id'].append(article_id)
    summaries['sentences_id'].append(sentences_id)

In [276]:
lsi = pd.DataFrame(summaries)

In [279]:
lda = pd.DataFrame(summaries)

In [280]:
lda_test = pd.DataFrame(summaries)
lda_test

Unnamed: 0,article_id,summary,sentences_id
0,PMC2836490.json,Cannula placement was confirmed at the end of ...,"[3, 2, 9, 8]"
1,PMC2692135.json,Examining the prevalence of social anxiety acr...,"[15, 7, 10, 2]"
2,PMC2940415.json,Forty eight adult and pediatric CF patients wi...,"[0, 3, 4, 2]"
3,PMC6019182.json,our findings identify the presence of intratum...,"[6, 5, 2, 4]"
4,PMC2891222.json,Subchondral bone marrow edema like lesions (BM...,"[0, 9, 4, 10]"
...,...,...,...
10451,PMC6980738.json,10.1016/j.ygyno.2019.11.002 31776037,[0]
10452,PMC6986304.json,"After the occurrence of SCI in rats, the addit...","[0, 1, 0]"
10453,PMC6986779.json,31946748 Functional LGE imaging allows clear d...,"[1, 0, 1]"
10454,PMC6994319.json,Over ten years after CDCâs revised recommend...,"[0, 2, 1]"
