In [1]:
import json
import gzip
import pandas as pd
import ast
    
test_df = pd.read_csv("../data/test.csv")
print(test_df.head())

                                        article_text  \
0  ["anxiety affects quality of life in those liv...   
1  ['small non - coding rnas are transcribed into...   
2  ['ohss is a serious complication of ovulation ...   
3  ['congenital adrenal hyperplasia ( cah ) refer...   
4  ['type 1 diabetes ( t1d ) results from the des...   

                                       abstract_text  
0  ["<S> research on the implications of anxiety ...  
1  ['<S> small non - coding rnas include sirna , ...  
2  ['<S> objective : to evaluate the efficacy and...  
3  ['<S> congenital adrenal hyperplasia is a grou...  
4  ['<S> objective(s):pentoxifylline is an immuno...  


# Libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

nlp = spacy.load('en_core_web_lg')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx # for PageRank

# Evaluation Report

In [16]:
def evaluation_report(gold, pred):
    """Print ROUGE_1, BLEU_1, and F1 score.
    
    Args:
        gold: The set with the gold-standard values.
        pred: The set with the predicted values.
    
    Returns:
        Nothing, but prints the ROUGE_1, BLEU_1, and F1 values computed
        based on the specified sets.
    """
    freq_ROUGE = 0
    for each in gold:
        if each in pred:
            freq_ROUGE += 1
    # portion of the words from golden summary appering in the generated summary
    ROUGE = freq_ROUGE/len(gold) 
    print("--------------ROUGE (Recall):")
    print(f"{round(ROUGE*100, 2)}%")
    
    # Brevity penalised frequency
    pred_count = {}
    for each in set(pred):
        pred_count[each] = min(gold.count(each), pred.count(each))

    freq_BLEU = sum(pred_count.values())
    
    # (with brevity penalty) portion of the words from generated summary appering in the generated summary
    BLEU = freq_BLEU/len(pred) 
    print("--------------BLEU (Precision):")
    print(f"{round(BLEU*100, 2)}%")

    f1 = 2*(ROUGE * BLEU)/(ROUGE + BLEU)
    print("--------------F1 score:")
    print(f"{round(f1*100, 2)}%")

# Preprocessing -- take one case as an example

In [17]:
text = ast.literal_eval(test_df['article_text'][20]) # Convert sttring representatino of a list to a list
abstract = ast.literal_eval(test_df['abstract_text'][20])

abstract = [sent.replace(" </S>", "") for sent in abstract]
abstract = [sent.replace("<S> ", "") for sent in abstract]

sentences = []
for sent in text:
    doc = nlp(sent)
    doc = [token.lemma_ for token in doc if token.is_stop == False] # stop word removal and lemmatisation"
    doc = [token for token in doc if token.isalpha() == True] # exlude non-alphabetic lemmas
    sentences.append(" ".join(doc))
    
corpus = sentences

# Abstract preprocessing

abstract_sentences_pro = []
for sent in abstract:
    doc = nlp(sent)
    doc = [token.lemma_ for token in doc if token.is_stop == False] # stop word removal and lemmatisation"
    doc = [token for token in doc if token.isalpha() == True] # exlude non-alphabetic lemmas
    abstract_sentences_pro.append(" ".join(doc))

# TextRank

In [18]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print(f"Shape of X: {X.shape}")

# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((X.shape[0], X.shape[0]))
for ind_1 in range(X.shape[0]):
    for ind_2 in range(ind_1, X.shape[0]):
        sim_mat[ind_1, ind_2] = cosine_similarity(X[ind_1, :], X[ind_2, :])
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
            
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph)

sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

top_sent_pre = [] # before preprocessing
top_sent_pro = [] # after preprocessing
for k, v in sorted_scores.items():
    if len(top_sent_pre) < len(abstract):
        print(k)
        top_sent_pre.append(text[k])
        top_sent_pro.append(sentences[k])
    else:
        break
        
print("\n-------------Generated summary:")
print(top_sent_pre)

print("\n-------------Real abstract:")
print(abstract)

Shape of X: (76, 599)
18
54
11
41

-------------Generated summary:
['therefore , it was our objective to assess the potential antimicrobial activity of aristolochia bracteolata using a bioassay - guided fractionation , in order to produce pure compound that can act as the lead compound in developing new , safe , and effective drug to replace the use of the harmful crude plant material .', 'the resulting fractions were tested for antibacterial and antifungal activities . the crude extract and chloroform fraction were significantly active against sea urchin - derived bacillus sp . and both standard strain and clinical isolates of moraxella catarrhalis and were moderately active against s. aureus , b. subtilis , and ps .', 'organic solvent extracts of the plant showed antibacterial activities while the water extract showed antifungal activity .', 'the sterile paper discs ( 6  mm in diameter ) which were impregnated with the plant extract ( 14  mg ) and pure compound ( 10100  g ) were plac

TextRank features longer sentences

### Evaluation

In [19]:
gen_abst = " ".join(top_sent_pro)
gol_abst = " ".join(abstract_sentences_pro)

# Unigrams

gen_abst_1 = gen_abst.split()
gol_abst_1 = gol_abst.split()

print("\nEvaluation of unigrams:")
evaluation_report(gol_abst_1, gen_abst_1)

# Bigrams

gen_abst_2 = []
gol_abst_2 = []

for i in range(len(gol_abst_1) - 1):
    gol_abst_2.append(gol_abst_1[i] + " " + gol_abst_1[i + 1])
for i in range(len(gen_abst_1) - 1):
    gen_abst_2.append(gen_abst_1[i] + " " + gen_abst_1[i + 1])
    
print("\nEvaluation of bigrams:")
evaluation_report(gol_abst_2, gen_abst_2)

# Trigrams
# Trigrams

gen_abst_3 = []
gol_abst_3 = []

for i in range(len(gol_abst_1) - 2):
    gol_abst_3.append(gol_abst_1[i] + " " + gol_abst_1[i + 1] +  " " + gol_abst_1[i + 2])
for i in range(len(gen_abst_1) - 2):
    gen_abst_3.append(gen_abst_1[i] + " " + gen_abst_1[i + 1] +  " " + gen_abst_1[i + 2])
    
print("\nEvaluation of trigrams:")
evaluation_report(gol_abst_3, gen_abst_3)


Evaluation of unigrams:
--------------ROUGE (Recall):
51.52%
--------------BLEU (Precision):
43.68%
--------------F1 score:
47.27%

Evaluation of bigrams:
--------------ROUGE (Recall):
21.43%
--------------BLEU (Precision):
18.6%
--------------F1 score:
19.92%

Evaluation of trigrams:
--------------ROUGE (Recall):
10.31%
--------------BLEU (Precision):
10.59%
--------------F1 score:
10.45%


# WordRank

In [20]:
words = []
for sent in sentences:
    [words.append(word) for word in sent.split()]
    
words = list(set(words)) # Find unique words in all processed sentences

# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((len(words), len(words)))
for ind_1 in range(len(words)):
    for ind_2 in range(ind_1, len(words)):
        sim_mat[ind_1, ind_2] = cosine_similarity(nlp.vocab[words[ind_1]].vector.reshape(1, 300), nlp.vocab[words[ind_2]].vector.reshape(1, 300))
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
            
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph, tol=1.0e-2)

# Find sentence scores as sum of all included word scores
sent_scores = {}

for ind in range(len(sentences)):
    w_score = 0
    for word in sentences[ind].split():
        w_score += scores[np.where(np.array(words) == word)[0][0]]
    sent_scores[ind] = w_score
    
sorted_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))

top_sent_word_pre = []
top_sent_word_pro = []
for k, v in sorted_scores.items():
    if len(top_sent_word_pre) < len(abstract):
        top_sent_word_pre.append(text[k])
        top_sent_word_pro.append(sentences[k])
    else:
        break

### Evaluation

In [21]:
gen_abst_word = " ".join(top_sent_word_pro)

# Unigrams

gen_abst_word_1 = gen_abst_word.split()

print("\nEvaluation of unigrams:")
evaluation_report(gol_abst_1, gen_abst_word_1)

# Bigrams
gen_abst_word_2 = []

for i in range(len(gen_abst_1) - 1):
    gen_abst_word_2.append(gen_abst_word_1[i] + " " + gen_abst_word_1[i + 1])
    
print("\nEvaluation of bigrams:")
evaluation_report(gol_abst_2, gen_abst_word_2)

# Trigrams
gen_abst_word_3 = []

for i in range(len(gen_abst_word_1) - 2):
    gen_abst_word_3.append(gen_abst_word_1[i] + " " + gen_abst_word_1[i + 1] +  " " + gen_abst_word_1[i + 2])
    
print("\nEvaluation of trigrams:")
evaluation_report(gol_abst_3, gen_abst_word_3)


Evaluation of unigrams:
--------------ROUGE (Recall):
35.35%
--------------BLEU (Precision):
14.01%
--------------F1 score:
20.07%

Evaluation of bigrams:
--------------ROUGE (Recall):
1.02%
--------------BLEU (Precision):
1.16%
--------------F1 score:
1.09%

Evaluation of trigrams:
--------------ROUGE (Recall):
2.06%
--------------BLEU (Precision):
0.65%
--------------F1 score:
0.98%


# Rerun TextRank with embegginds

## BoW

In [22]:
sent_vec = {}

for ind in range(len(sentences)):
    bow = np.zeros((1, 300))
    for word in sentences[ind].split():
        bow += nlp.vocab[word].vector.reshape(1, 300)
    sent_vec[ind] = bow

In [23]:
sim_mat = np.zeros((len(sentences), len(sentences)))
for ind_1 in range(len(sentences)):
    for ind_2 in range(ind_1, len(sentences)):
        sim_mat[ind_1, ind_2] = cosine_similarity(sent_vec[ind_1], sent_vec[ind_2])
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]

sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph, tol=1.0e-2)

sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

top_sent_emb_pre = []
top_sent_emb_pro = []
for k, v in sorted_scores.items():
    if len(top_sent_emb_pre) < len(abstract):
        top_sent_emb_pre.append(text[k])
        top_sent_emb_pro.append(sentences[k])
    else:
        break

### Evaluation

In [24]:
gen_abst_emb = " ".join(top_sent_emb_pro)

# Unigrams

gen_abst_emb_1 = gen_abst_emb.split()

print("\nEvaluation of unigrams:")
evaluation_report(gol_abst_1, gen_abst_emb_1)

# Bigrams
gen_abst_emb_2 = []

for i in range(len(gen_abst_emb_1) - 1):
    gen_abst_emb_2.append(gen_abst_emb_1[i] + " " + gen_abst_emb_1[i + 1])
    
print("\nEvaluation of bigrams:")
evaluation_report(gol_abst_2, gen_abst_emb_2)

# Trigrams
gen_abst_emb_3 = []

for i in range(len(gen_abst_emb_1) - 2):
    gen_abst_emb_3.append(gen_abst_emb_1[i] + " " + gen_abst_emb_1[i + 1] +  " " + gen_abst_emb_1[i + 2])
    
print("\nEvaluation of trigrams:")
evaluation_report(gol_abst_3, gen_abst_emb_3)


Evaluation of unigrams:
--------------ROUGE (Recall):
72.73%
--------------BLEU (Precision):
49.55%
--------------F1 score:
58.94%

Evaluation of bigrams:
--------------ROUGE (Recall):
35.71%
--------------BLEU (Precision):
27.27%
--------------F1 score:
30.93%

Evaluation of trigrams:
--------------ROUGE (Recall):
21.65%
--------------BLEU (Precision):
18.35%
--------------F1 score:
19.86%


# Hybrid

### Reduce text using WordRank

In [25]:
words = []
for sent in sentences:
    [words.append(word) for word in sent.split()]
    
words = list(set(words))

# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((len(words), len(words)))
for ind_1 in range(len(words)):
    for ind_2 in range(ind_1, len(words)):
        #sim_mat[ind_1, ind_2] = cosine_similarity(vectorizer.transform([words[ind_1]]), vectorizer.transform([words[ind_2]]))
        sim_mat[ind_1, ind_2] = cosine_similarity(nlp.vocab[words[ind_1]].vector.reshape(1, 300), nlp.vocab[words[ind_2]].vector.reshape(1, 300))
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
        
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph, tol=1.0e-2)

sent_scores = {}

for ind in range(len(sentences)):
    w_score = 0
    for word in sentences[ind].split():
        w_score += scores[np.where(np.array(words) == word)[0][0]]
    sent_scores[ind] = w_score
    
sorted_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))

### Run TextRank on these 64% sentences

In [26]:
# Retrieve important sentences indices

new_sent_ind = []
count = 0

for k in sorted_scores.keys():
    if count > len(sorted_scores)*0.64:
        break
    else:
        new_sent_ind.append(k)
        count += 1

# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((len(new_sent_ind), len(new_sent_ind)))
for ind_1 in range(len(new_sent_ind)):
    for ind_2 in range(ind_1, len(new_sent_ind)):
        sim_mat[ind_1, ind_2] = cosine_similarity(X[ind_1, :], X[ind_2, :])
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
            
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph)

sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

top_sent_hyb_pre = []
top_sent_hyb_pro = []
for k, v in sorted_scores.items():
    if len(top_sent_hyb_pre) < len(abstract):

        top_sent_hyb_pre.append(text[k])
        top_sent_hyb_pro.append(sentences[k])
    else:
        break
        
        
        
gen_abst_hyb = " ".join(top_sent_hyb_pro)

# Unigrams

gen_abst_hyb_1 = gen_abst_hyb.split()

print("\nEvaluation of unigrams:")
evaluation_report(gol_abst_1, gen_abst_hyb_1)

# Bigrams
gen_abst_hyb_2 = []

for i in range(len(gen_abst_hyb_1) - 1):
    gen_abst_hyb_2.append(gen_abst_hyb_1[i] + " " + gen_abst_hyb_1[i + 1])
    
print("\nEvaluation of bigrams:")
evaluation_report(gol_abst_2, gen_abst_hyb_2)

# Trigrams
gen_abst_hyb_3 = []


for i in range(len(gen_abst_hyb_1) - 2):
    gen_abst_hyb_3.append(gen_abst_hyb_1[i] + " " + gen_abst_hyb_1[i + 1] +  " " + gen_abst_hyb_1[i + 2])
    
print("\nEvaluation of trigrams:")
evaluation_report(gol_abst_3, gen_abst_hyb_3)


Evaluation of unigrams:
--------------ROUGE (Recall):
36.36%
--------------BLEU (Precision):
33.78%
--------------F1 score:
35.03%

Evaluation of bigrams:
--------------ROUGE (Recall):
11.22%
--------------BLEU (Precision):
9.59%
--------------F1 score:
10.34%

Evaluation of trigrams:
--------------ROUGE (Recall):
2.06%
--------------BLEU (Precision):
1.39%
--------------F1 score:
1.66%


### Run TextRank on these 80% sentences

In [27]:
words = []
for sent in sentences:
    [words.append(word) for word in sent.split()]
    
words = list(set(words))

# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((len(words), len(words)))
for ind_1 in range(len(words)):
    for ind_2 in range(ind_1, len(words)):
        #sim_mat[ind_1, ind_2] = cosine_similarity(vectorizer.transform([words[ind_1]]), vectorizer.transform([words[ind_2]]))
        sim_mat[ind_1, ind_2] = cosine_similarity(nlp.vocab[words[ind_1]].vector.reshape(1, 300), nlp.vocab[words[ind_2]].vector.reshape(1, 300))
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
        
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph, tol=1.0e-2)

sent_scores = {}

for ind in range(len(sentences)):
    w_score = 0
    for word in sentences[ind].split():
        w_score += scores[np.where(np.array(words) == word)[0][0]]
    sent_scores[ind] = w_score
    
sorted_scores = dict(sorted(sent_scores.items(), key=lambda item: item[1], reverse=True))

# Retrieve important sentences indices

new_sent_ind = []
count = 0

for k in sorted_scores.keys():
    if count > len(sorted_scores)*0.8:
        break
    else:
        new_sent_ind.append(k)
        count += 1
        
# Calculate the cosine similarity -- store result in a similarity matrix
sim_mat = np.zeros((len(new_sent_ind), len(new_sent_ind)))
for ind_1 in range(len(new_sent_ind)):
    for ind_2 in range(ind_1, len(new_sent_ind)):
        sim_mat[ind_1, ind_2] = cosine_similarity(X[ind_1, :], X[ind_2, :])
        sim_mat[ind_2, ind_1] = sim_mat[ind_1, ind_2]
            
sim_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(sim_graph)

sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

top_sent_hyb_pre = []
top_sent_hyb_pro = []
for k, v in sorted_scores.items():
    if len(top_sent_hyb_pre) < len(abstract):

        top_sent_hyb_pre.append(text[k])
        top_sent_hyb_pro.append(sentences[k])
    else:
        break
        
        
        
gen_abst_hyb = " ".join(top_sent_hyb_pro)

# Unigrams

gen_abst_hyb_1 = gen_abst_hyb.split()

print("\nEvaluation of unigrams:")
evaluation_report(gol_abst_1, gen_abst_hyb_1)

# Bigrams
gen_abst_hyb_2 = []

for i in range(len(gen_abst_hyb_1) - 1):
    gen_abst_hyb_2.append(gen_abst_hyb_1[i] + " " + gen_abst_hyb_1[i + 1])
    
print("\nEvaluation of bigrams:")
evaluation_report(gol_abst_2, gen_abst_hyb_2)

# Trigrams
gen_abst_hyb_3 = []


for i in range(len(gen_abst_hyb_1) - 2):
    gen_abst_hyb_3.append(gen_abst_hyb_1[i] + " " + gen_abst_hyb_1[i + 1] +  " " + gen_abst_hyb_1[i + 2])
    
print("\nEvaluation of trigrams:")
evaluation_report(gol_abst_3, gen_abst_hyb_3)


Evaluation of unigrams:
--------------ROUGE (Recall):
51.52%
--------------BLEU (Precision):
43.68%
--------------F1 score:
47.27%

Evaluation of bigrams:
--------------ROUGE (Recall):
21.43%
--------------BLEU (Precision):
18.6%
--------------F1 score:
19.92%

Evaluation of trigrams:
--------------ROUGE (Recall):
10.31%
--------------BLEU (Precision):
10.59%
--------------F1 score:
10.45%
