In [1]:
#@title Setup Environment
#!pip install --quiet tensorflow==1.15.0
#!pip install --quiet tensorflow_hub==0.5.0
#!pip install --quiet tf_sentencepiece==0.1.86
#!pip install --quiet googletrans==2.4.0
#!pip install --quiet japanize-matplotlib==1.0.4
#!pip install --quiet mecab-python3
#!pip install --quiet https://github.com/megagonlabs/ginza/releases/download/v1.0.2/ja_ginza_nopn-1.0.2.tgz
#!pip install --quiet https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
    
#!ln -s /usr/local/lib/python3.6/dist-packages/ja_ginza_nopn /usr/local/lib/python3.6/dist-packages/spacy/data/ja_ginza_nopn

In [2]:
#@title Setup common imports and functions
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import tf_sentencepiece
from googletrans import Translator
import numpy as np
import matplotlib.pyplot as plt
#import japanize_matplotlib
import seaborn as sns
import spacy
from IPython.display import HTML
from sklearn import manifold
from sklearn.metrics.pairwise import cosine_distances

%matplotlib inline


def ncossim(embs_1, embs_2, axis=0):
    """Standardise pairwise inner-product similarities along one axis.

    Computes ``sims = np.inner(embs_1, embs_2)`` and returns
    ``0.5 + (sims - ex) / std`` where, along ``axis``, ``std`` is the standard
    deviation of ``sims`` and ``ex`` is the mean of ``(sims - min) / max``.

    Args:
        embs_1: Array of embeddings, one embedding per row.
        embs_2: Array of embeddings, one embedding per row.
        axis: Axis of the similarity matrix over which the statistics are
            taken (0: across the rows of ``embs_1``, 1: across the rows of
            ``embs_2``).

    Returns:
        Array with the same shape as ``np.inner(embs_1, embs_2)``.
    """
    # NOTE(review): the inner product is only a cosine similarity if the
    # embeddings are unit-normalised -- presumably true for the encoder used
    # here; confirm.
    sims = np.inner(embs_1, embs_2)

    # keepdims=True leaves the reduced axis in place with size 1, so the
    # statistics broadcast back against `sims` for any `axis`. Without it only
    # axis=0 lines up with the trailing dimension.
    std = np.std(sims, axis=axis, keepdims=True)
    lowest = np.min(sims, axis=axis, keepdims=True)
    highest = np.max(sims, axis=axis, keepdims=True)

    # NOTE(review): the centring term is the mean of the rescaled values, while
    # the value being centred and the std use the raw similarities -- confirm
    # this asymmetry is intended.
    ex = np.mean((sims - lowest) / highest, axis=axis, keepdims=True)
    return 0.5 + (sims - ex) / std


def mmr(doc_emb, cand_embs, key_embs, relevance_weight=0.5):
    """Score candidates by relevance to the document minus redundancy.

    Each candidate's score is ``relevance_weight`` times its normalised
    similarity to the document. When ``key_embs`` is given, the score is
    reduced by ``(1 - relevance_weight)`` times the candidate's highest
    normalised similarity to any already selected key.

    Args:
        doc_emb: Embedding of the whole document.
            NOTE(review): assumed to be 2-D with a single row, as produced by
            embedding a one-element list -- confirm against callers.
        cand_embs: Embeddings of the remaining candidates, one per row.
        key_embs: Embeddings of the already selected keys, one per row, or
            ``None`` when nothing has been selected yet.
        relevance_weight: Weight of the relevance term; the redundancy term
            gets ``1 - relevance_weight``. Defaults to 0.5, the value that was
            previously hard-coded.

    Returns:
        Array of scores with one row per candidate.
    """
    scores = relevance_weight * ncossim(cand_embs, doc_emb, axis=0)
    if key_embs is not None:
        # Highest similarity of each candidate to any selected key, reshaped to
        # a column so it lines up with `scores`.
        redundancy = np.max(ncossim(cand_embs, key_embs), axis=1)
        scores -= (1 - relevance_weight) * redundancy.reshape(scores.shape[0], -1)
    return scores

'''
def embedrank(doc_emb, sent_embs, n_keys):
    assert 0 < n_keys, 'Please `key_size` value set more than 0'
    assert n_keys < len(sent_embs), 'Please `key_size` value set lower than `#sentences`'
    sims = np.inner(doc_emb, sent_embs).reshape(-1)
    return np.argsort(-sims)[:n_keys]
'''

def embedrankpp(doc_emb, sent_embs, n_keys):
    """Greedily select ``n_keys`` row indices of ``sent_embs`` using ``mmr``.

    On every round the not-yet-selected rows are scored with ``mmr`` against
    the document embedding and the rows selected so far; the highest-scoring
    row is moved from the candidate pool to the result.

    Args:
        doc_emb: Embedding of the whole document.
        sent_embs: Sentence embeddings, one per row; must support indexing
            with a list of integers.
        n_keys: Number of indices to select; must satisfy
            ``0 < n_keys < len(sent_embs)``.

    Returns:
        List of selected row indices, in selection order.

    Raises:
        AssertionError: If ``n_keys`` is outside the allowed range.
    """
    assert 0 < n_keys, 'Please `key_size` value set more than 0'
    assert n_keys < len(sent_embs), 'Please `key_size` value set lower than `#sentences`'

    remaining = list(range(len(sent_embs)))
    chosen = []
    while len(chosen) < n_keys:
        # No redundancy term on the first round: nothing has been chosen yet.
        chosen_embs = sent_embs[chosen] if chosen else None
        scores = mmr(doc_emb, sent_embs[remaining], chosen_embs)
        # `best` is a position within `remaining`, not a sentence index.
        best = np.argmax(scores)
        chosen.append(remaining.pop(best))
    return chosen

In [3]:
from tensorflow import keras
#@title Build a model
# Label and TF-Hub URL of the sentence encoder. Only `module_url` is read in
# this cell; `encoder` is not referenced in the visible cells.
encoder = 'universal-sentence-encoder-multilingual'
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/1'



# Build the embedding graph: string placeholder -> hub module -> embeddings.
g = tf.Graph()
with g.as_default():
    # 1-D string placeholder of unspecified length (shape=[None]).
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    xling_embed = hub.Module(module_url)
    embedded_text = xling_embed(text_input)
    # Single op grouping the global-variable and table initialisers.
    init_options = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# Mark the graph as finished so no further ops can be added to it.
g.finalize()

# The session stays open at notebook scope; the "Model run" cell calls
# session.run(embedded_text, ...) on it.
session = tf.Session(graph=g)
session.run(init_options)

    
# Ranking function used by the "Model run" cell; `ranker` is a label only and
# is not referenced in the visible cells.
ranker = 'EmbedRank++' 
rank_fn = embedrankpp

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [4]:
# Document from Wikipedia
'''
doc = ["""
I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit.
It was the first computer with beautiful typography.
I had been rejected, but I was still in love.
Your work is going to fill a large part of your life, and the only way to be truly satisfied is to do what you believe is great work.
I didn’t even know what a pancreas was.
When I was young, there was an amazing publication called The Whole Earth Catalog, which was one of the bibles of my generation.
The major limitation is that standard language models are unidirectional, and this limits the choice of architectures that can be used during pre-training.
"""]'''

from src.main import long_summary

# Wrap the imported text in a one-element list: later cells join `doc` into a
# single string and also feed it to the encoder as a batch.
doc = [long_summary]

In [5]:
#@title Language detection and sentence segmentation
# Join the document pieces once; the same text is used for language detection
# and for sentence segmentation.
doc_text = ''.join(doc)

# Detect the language of the whole document via googletrans.
translator = Translator()
detected_lang = translator.detect(doc_text)

# Only the English spaCy model is loaded below, so reject any other language
# up front.
assert detected_lang.lang in ['en'], 'Please, input English text'
tokenizer = spacy.load('en_core_web_sm')

# One string per sentence found by spaCy, with embedded newlines removed.
sents = [str(s).replace('\n', '') for s in tokenizer(doc_text).sents]
#print(f'Language: {detected_lang.lang}')
#print(f'#sentences: {len(sents)}')

Language: en
#sentences: 8


In [6]:
#@title Model run
key_size = 3 #@param {type:"integer"}


def _embed(texts):
    """Run the encoder graph on a list of strings and return the result."""
    return session.run(embedded_text, feed_dict={text_input: texts})


# Embedding: the whole document first, then every sentence.
doc_emb = _embed(doc)
sent_embs = _embed(sents)

# Ranking: `keys` holds the indices into `sents` chosen by the ranking function.
keys = rank_fn(doc_emb, sent_embs, key_size)

In [18]:
'''
#@title Display
display_sents = []

for i, s in enumerate(sents):
    line = '<font color="#CD5C5C"><strong>' + s + '</strong></font>' if i in keys else s
    display_sents.append(line)

HTML(''.join(display_sents))
'''


In [19]:
# Collect the selected sentences. Iterating over `sents` means the summary
# follows document order, regardless of the order of the indices in `keys`.
selected = set(keys)  # set membership instead of scanning the list per sentence
summary = [sentence for idx, sentence in enumerate(sents) if idx in selected]
summary

['I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit.',
 'It was the first computer with beautiful typography.',
 'When I was young, there was an amazing publication called The Whole Earth Catalog, which was one of the bibles of my generation.']