# Extractive Summarization

#### Required Libraries

In [1]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import operator
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [2]:
# Intializing the Word2Vec Model, download the file from https://nlp.stanford.edu/projects/glove/
# Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): glove.6B.zip 
# Unzip the file then run: python3 -m gensim.scripts.glove2word2vec --input  glove.6B.300d.txt --output glove.6B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.300d.w2vformat.txt')

In [3]:
word_freq_map = {}
with open("data/arxiv-release/vocab", 'r') as vocab_file:
    lines = vocab_file.readlines()
    for line in lines:
        word_freq_map[line.split()[0]] = int(line.split()[1])
    
stop_list = sorted(word_freq_map.items(), key=operator.itemgetter(1), reverse=True)[:150]
cache = {}

#### Read the full training data and split it into smaller chunks

In [4]:
def read_data_and_split(train_corpus_path):
    train_data_map = {}
    file_no = 0
    with open(train_corpus_path, 'r') as train_data_file:
        line_count = 0
        while file_no < 11:
            if line_count < 20000:
                line_data = train_data_file.readline()
                if line_data:
                    line_map = json.loads(line_data)
                    article_id = line_map['article_id']
                    del line_map['article_id']
                    train_data_map[article_id] = line_map
                    line_count += 1
                else:
                    break
            else:
                with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
                    pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)
                print("File ", file_no, " Done")
                train_data_map.clear()
                file_no += 1
                line_count = 0
        with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
            pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Loads a particular pickle file of the training data into memory

In [5]:
def load_data_from_pickle(train_corpus_path):
    data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
    data_map = {}
    with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
        data_map = pickle.load(handle)
    return data_map

#### Gets all the sentences of the article along with its metadata

In [53]:
def get_sentences_with_metadata(data_map):
    full_text = []
    sentence_metadata = []
    list_of_sentences = []
    summary_list = []
    abstract_list = []
    c = 0
    file_number = 1
    for article_id, data in tqdm(data_map.items()):
        c += 1
        if c == 13:
            abstract_list.append(data['abstract_text'])
            section_data = data['sections']
            section_names = data['section_names']
            for i, section in enumerate(section_data):
                for line in section:
                    split_line = line.split('.')
                    for l in split_line:
                        list_of_sentences.append(l)
                        sentence_metadata.append(section_names[i])
            break
            summary_list.append(do_stuff_and_get_summary(list_of_sentences, sentence_metadata))
            list_of_sentences.clear()

        else:
            continue
            c = 0
            write_summary_and_abstract_to_file(summary_list, abstract_list, file_number)
            print("File ", file_number, " done")
            file_number += 1
            summary_list.clear()
            abstract_list.clear()
    print(list_of_sentences)
    return list_of_sentences, sentence_metadata

#### The Following 2 functions are used for Preprocessing of a given text

In [7]:
def is_ascii(word):
    """
    Checks if word is ascii or not
    :param word: token
    :return: Boolean
    """
    valid = True
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        valid = False
    return valid

In [8]:
def get_processed_tokens(sentence):
    punc_map = {}
    punc_map = punc_map.fromkeys('!"\'()*+,;<>[]^`{|}~:=%&_#?-$/', ' ')
    table = str.maketrans(punc_map)
    tokens = sentence.lower().translate(table).split()
    stop_words = set(stopwords.words('english')) 
    stop_words = list(stop_words) + stop_list
    cleaned_tokens = [word for word in tokens if word not in stop_words and is_ascii(word) and '@' not in word and '\\' not in word and len(word) > 1]            
    return cleaned_tokens

#### Gets the processed sentences for each sentence of the article

In [9]:
def make_processed_sentences(list_of_sentences):
    processed_sentences = []
    for sentence in list_of_sentences:
        if isinstance(sentence, list):
            sentence = " ".join(sentence)
        processed_sentences.append(get_processed_tokens(sentence))
    return processed_sentences

#### Gives the number of words common between given 2 sentences

In [10]:
def get_no_of_common_word(sentence1, sentence2):
    common_count = 0
    for s1 in sentence1:
        for s2 in sentence2:
            if s1 == s2:
                common_count += 1
    return common_count

In [11]:
def get_word_vec_sim(sentence1, sentence2):
    score = 0
    for word1 in sentence1:
        for word2 in sentence2:
            try:
                temp = cache[word1+word2]
            except:
                try:
                    temp = model.similarity(word1, word2)
                    cache[word1+word2] = temp
                    cache[word2+word1] = temp
                except:
                    cache[word1+word2] = 0
                    cache[word2+word1] = 0
                    temp = 0
            score += temp
    return score

#### Generic scoring function which gives a score between 2 sentences

In [12]:
def scoring(sentence1, sentence2, metadata):
    len_normalize = len(sentence1) + len(sentence2) + 1 # Normalizing by length of vector
    common_words = get_no_of_common_word(sentence1, sentence2)
    word_vec_score = get_word_vec_sim(sentence1, sentence2)
    score = common_words / 2*len_normalize + word_vec_score / len_normalize
    return score

#### Makes the graph which has relations between every pair of sentences

In [13]:
def make_graph(processed_sentences, metadata):
    sentence_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    sentence_common_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    
    for i in range(len(processed_sentences)):
        for j in range(len(processed_sentences)):
            sentence1 = processed_sentences[i]
            sentence2 = processed_sentences[j]
            if i == j:
                sentence_graph[i][j] = 0
                sentence_common_graph[i][j] = 0
            else:
                sentence_graph[i][j] = scoring(sentence1, sentence2, metadata)
                sentence_common_graph[i][j] = get_no_of_common_word(sentence1, sentence2)
    return sentence_graph, sentence_common_graph

#### Following functions are different ways to give a score to a sentence

##### (1) Aggregation

In [14]:
def calculate_scores(sentence_graph):
    scores = np.zeros(len(sentence_graph))
    for i,sentence in enumerate(sentence_graph):
        scores[i] = sum(sentence_graph[i])
    return scores

##### (2) Page Rank

In [15]:
def calculate_pagerank_scores(sentence_graph):
    N = len(sentence_graph)
    d = 0.15   # PageRank Hyperparameter
    pagerank_scores = np.ones(N)
    
    out_degree = np.zeros(N)
    for i in range(N):
        for j in range(N):
            if sentence_graph[i][j]:
                out_degree[i] += sentence_graph[i][j]
    
    for i in range(N):
        score = 0
        for j in range(N):
            if sentence_graph[j][i]:
                score += (pagerank_scores[j] / out_degree[j])
        pagerank_scores[i] = (d / N) + (1 - d) * score
    return pagerank_scores    

#### Ranks the sentences based on any one of the above scoring methods and return the Summary

In [16]:
def rank_sentences_and_make_summary2(sentences, processed_sentences, sentence_graph, scores):
    scores_indices = np.argsort(scores)
    ordered_sentences = scores_indices[::-1]
    summary = []
    for i in range(5):
        summary.append(sentences[ordered_sentences[i]])
#         print(ordered_sentences[i], scores[ordered_sentences[i]])
#         print(processed_sentences[ordered_sentences[i]])
    return summary

In [33]:
def rank_sentences_and_make_summary(sentences, processed_sentences, sentence_graph, scores, summary_length):
    summary = []
    for i in range(summary_length): # Number of Sentences we want in the summary
        score_indices = np.argsort(scores)
        if len(score_indices) < 1:
            break
        selected_index = score_indices[-1]
        summary.append(sentences[selected_index]) # Adding highest score sentence to summary
        mean_score = np.mean(sentence_graph)
        to_decrease = []
        # Calculated mean similarity score. If selected sentence and another sentence have
        # high similarity, the score of the second sentence should be reduced.
        # Here, have chosen to use 1.5 * mean_score as the threshold, and divided score in half.
        for iterator in range(len(processed_sentences)):
            if sentence_graph[iterator][selected_index] > 1.5 * mean_score:
                to_decrease.append(iterator)
            if sentence_graph[selected_index][iterator] > 1.5 * mean_score:
                to_decrease.append(iterator)
        for sentence in set(to_decrease):
            # Should be changed based on the number of sentences needed in the summary
            scores[sentence] /= (1 + 1.0 / summary_length) # Reduced score by half, to on average prevent from being picked.
        scores[selected_index] = 0
    return summary
        

#### Main Program which calls the above defined functions

In [18]:
train_corpus_path = "data/log_files/train.txt"

In [19]:
data_map = load_data_from_pickle(train_corpus_path)

In [20]:
len(data_map)

20000

In [54]:
def do_stuff_and_get_summary(list_of_sentences, sentence_metadata):
    list_of_sentences = [sentence.strip() for sentence in list_of_sentences if len(sentence) > 1]
    processed_sentences = make_processed_sentences(list_of_sentences)
    sentence_graph, sentence_common_graph = make_graph(processed_sentences, sentence_metadata)
    sentence_scores = calculate_scores(sentence_graph)
    sentence_page_scores = calculate_pagerank_scores(sentence_common_graph)
    sentence_score_final = [sentence_scores[i] * (sentence_page_scores[i]+1)for i in range(len(sentence_scores))]
    summary_length = 5
    summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_score_final, summary_length)
    return summary

In [22]:
def write_summary_and_abstract_to_file(summary_list, abstract_list, file_number):
    map_of_abstract_summary = {}
    
    abstract_map = {}
    for i, abstract in enumerate(abstract_list):
        abstract_map[i] = abstract
    
    summary_map = {}
    for i, summary in enumerate(summary_list):
        summary_map[i] = summary
    
    with open("data/map/abstract_file_" + str(file_number) + ".pickle", 'wb') as abs_file:
        pickle.dump(abstract_map, abs_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open("data/map/summary_file_" + str(file_number) + ".pickle", 'wb') as sum_file:
        pickle.dump(summary_map, sum_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Call this function and this function is now changed to generate summaries for all articles and store them to files

In [60]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)

  0%|          | 0/20000 [00:00<?, ?it/s]

['quantum - mechanical fluctuations during an early epoch of inflation provide a plausible mechanism to generate the energy - density perturbations responsible for observed cosmological structure ', ' while it has been known for quite some time that inflation is consistent with open spatial hypersurfaces ( gott 1982 ; guth &  weinberg 1983 ) , attention was initially focussed on models in which there are a very large number of @xmath17-foldings during inflation , resulting in almost exactly flat spatial hypersurfaces for the observable part of the present universe ( guth 1981 ; also see kazanas 1980 ; sato 1981a , b ) ', '', 'this was , perhaps , inevitable because of strong theoretical prejudice towards flat spatial hypersurfaces and their resulting simplicity ', '', 'however , to get a very large number of @xmath17-foldings during inflation it seems necessary that the inflation model have a small dimensionless parameter ( j', ' r', ' gott , private communication 1994 ; banks et al ',




In [61]:
summary = do_stuff_and_get_summary(list_of_sentences, sentence_metadata)
summary

['we consider four open model energy - density perturbation power spectra : ( 1 ) the open - bubble inflation model spectrum , accounting only for fluctuations that are generated during the evolution inside the bubble ( rp94 ) ; ( 2 ) the open - bubble inflation model spectrum , now also accounting for the fluctuations generated in the first epoch of inflation ( bgt ; yst ) ; ( 3 ) the open - bubble inflation model spectrum , now also accounting for both the usual fluctuations generated in the first epoch of inflation and a contribution from a non - square - integrable basis function ( yst ) ; and , ( 4 ) an open model with a flat - space scale - invariant spectrum ( w83 )',
 'grs discovered that ( given the uncertainties associated with the smaller - scale measurements ) the 1-@xmath80 uncertainty in the value of the dmr normalization precludes determination of robust constraints on model - parameter values , although the range of model - parameter space for the open - bubble inflatio

In [46]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)
list_of_sentences = [sentence.strip() for sentence in list_of_sentences if len(sentence) > 1]

  0%|          | 0/20000 [00:00<?, ?it/s]

['one surprising result that has come out of the more than 200 extrasolar planet discoveries to date is the wide range of eccentricities observed ', ' unlike our own solar system', ', many of the extrasolar planets which are not tidally locked to their host stars have moderate eccentricities ( @xmath1 ) , and 15 planets have high eccentricities ( @xmath0 ) ', '', 'these observations have spawned several theories as to the origin of highly eccentric extrasolar planets ', '', 'one such method , planet - planet scattering , occurs when multiple jovian planets form several astronomical units ( au ) from the host star and then interact , leaving one in an eccentric orbit and often ejecting the other @xcite ', '', 'this method has been proposed to explain the architecture of the @xmath2 and planetary system @xcite , which contains a hot jupiter as well as two jovian planets in moderately eccentric orbits ', '', '@xcite suggested a merger scenario in which inner protoplanets perturb each othe




In [47]:
processed_sentences = make_processed_sentences(list_of_sentences)
list_of_sentences


['one surprising result that has come out of the more than 200 extrasolar planet discoveries to date is the wide range of eccentricities observed',
 'unlike our own solar system',
 ', many of the extrasolar planets which are not tidally locked to their host stars have moderate eccentricities ( @xmath1 ) , and 15 planets have high eccentricities ( @xmath0 )',
 'these observations have spawned several theories as to the origin of highly eccentric extrasolar planets',
 'one such method , planet - planet scattering , occurs when multiple jovian planets form several astronomical units ( au ) from the host star and then interact , leaving one in an eccentric orbit and often ejecting the other @xcite',
 'this method has been proposed to explain the architecture of the @xmath2 and planetary system @xcite , which contains a hot jupiter as well as two jovian planets in moderately eccentric orbits',
 '@xcite suggested a merger scenario in which inner protoplanets perturb each other and merge to f

In [48]:
sentence_graph, sentence_common_graph = make_graph(processed_sentences, sentence_metadata)
sentence_graph

  if np.issubdtype(vec.dtype, np.int):


array([[ 0.00000000e+00,  4.66699958e-01,  4.14385930e+01, ...,
        -1.39285338e-02,  0.00000000e+00,  0.00000000e+00],
       [ 4.66699958e-01,  0.00000000e+00,  3.67676003e-01, ...,
        -4.33707815e-02,  0.00000000e+00,  0.00000000e+00],
       [ 4.14385930e+01,  3.67676003e-01,  0.00000000e+00, ...,
        -2.56720642e-02,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-1.39285338e-02, -4.33707815e-02, -2.56720642e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [49]:
sentence_scores = calculate_scores(sentence_graph)

In [50]:
sentence_page_scores = calculate_pagerank_scores(sentence_common_graph)

In [51]:
sentence_score_final = [sentence_scores[i] * (sentence_page_scores[i]+1)  for i in range(len(sentence_scores))]

In [52]:
summary_length = 5
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores, summary_length)
summary

['in particular , hd  3651 , hd  80606 , and hd  89744 were monitored intensely with the het as part of a search for short - period objects , whereas hd  37605 and hd  45350 were only observed sporadically after the known planet orbits were defined and published @xcite , and 16  cyg  b has only been observed with the hjs telescope at a frequency of at most once per month',
 'of the 15 stars hosting a planet with @xmath0 , six are also known to possess stellar - mass companions in wide binary orbits : hd  3651 @xcite , hd  20782 @xcite , hd  80606 , hd  89744 @xcite , 16  cyg  b , and hd  222582 @xcite',
 'section  2 describes the observations and the test particle simulations for six highly eccentric planetary systems : hd  3651 , hd  37605 , hd  45350 , hd  80606 , hd  89744 , and 16',
 'additional factors such as a paucity of data ( hd  37605 ) and short time baselines ( hd  80606 , hd  89744 ) made the determination of useful companion limits challenging for some of the planetary sy

In [None]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_score_final, summary_length)
summary

In [None]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_page_scores, summary_length)
summary
# make_processed_sentences(summary)

In [None]:
# Loading function
data_path = './data/map'
with open(data_path + "/" + "summary_file_1.pickle", 'rb') as handle:
    summary_map = pickle.load(handle)
len(summary_map)