# Multiple Article Summarisation
### Using a combination of Google's Universal Sentence Encoder (USE) and a clustering model, let's see if we can quickly understand the spread of content being shared in the social space.

In [None]:
from Article import Art

In [7]:
import pandas as pd
# Widen column display so long tweet text and URLs are not truncated in output.
pd.set_option("display.max_colwidth", 400)

# Load the exported tweets into `df`.
df = pd.read_csv('../DATA/westpactweets.csv')

In [29]:
# Per-URL engagement table: summed favourites, summed retweets and the count
# of non-null `text` values for each linked URL, sorted by count, then
# favourites, then retweets (all descending). Rows carrying the "nourl"
# placeholder are excluded.
base = (
    df.loc[df["propurl"] != "nourl"]
    .groupby("propurl")
    .agg({"favs": "sum", "rts": "sum", "text": "count"})
    .sort_values(["text", "favs", "rts"], ascending=False)
    .reset_index()
)

In [None]:
## Load Universal Sentence Encoder (legacy TF-Hub route -- kept for reference only)
# This cell is deliberately inert. The `hub` import below is commented out, so
# leaving `hub.Module(...)` live raised NameError on a fresh kernel; the encoder
# the notebook actually uses is the spaCy `en_use_lg` pipeline loaded in a later cell.
# To revive this route, uncomment the imports AND the final `embed = ...` line together.
# NOTE: the two ssl lines switch off HTTPS certificate verification for the
# whole process -- avoid enabling them outside a throwaway environment.
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# import tensorflow.compat.v1 as tf
# #To make tf 2.0 compatible with tf1.0 code, we disable the tf2.0 functionalities
# tf.disable_eager_execution()
# import tensorflow_hub as hub

# embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"

# Import the Universal Sentence Encoder's TF Hub module
# embed = hub.Module(module_url)

There is now a spaCy implementation of the Universal Sentence Encoder - follow the installation instructions at https://pypi.org/project/spacy-universal-sentence-encoder/

In [2]:
import spacy
# Load the spaCy-packaged Universal Sentence Encoder pipeline. The recorded
# output of this cell shows it fetching the "large" v5 weights (~577 MB).
encoder = spacy.load('en_use_lg')

Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB



In [4]:
# Smoke-test the encoder on a short sentence; the result is inspected in the
# following cell.
dump = encoder('Hi there, how are you?')
# NOTE(review): the commented lines below reference `nlp` and `doc_1`, neither
# of which is defined in the visible notebook -- leftover example code.
# doc_2 = nlp('Hello there, how are you doing today?')
# # Inspect the shape of the Doc, Span and Token vectors
# print(doc_1.vector.shape) # the full document representation
# print(doc_1[3], doc_1[3].vector.shape) # the word "how"
# print(doc_1[3:6], doc_1[3:6].vector.shape)

In [6]:
# Shape of the whole-document embedding produced by the encoder.
dump.vector.shape

(512,)

In [8]:
# Count the distinct `propurl` values in the raw frame (no "nourl" filtering
# is applied here).
n_unique_urls = len(df["propurl"].unique())
print(f'{n_unique_urls} unique articles in the last week')

143 unique articles in the last week


In [13]:
## Grab the summarised sentences for up to 50 linked articles.
# NOTE: `unique()` keeps first-appearance order, so these are the first 50
# distinct URLs in `df`, not the 50 most-shared ones.
messages = {}      # url -> summary sentences joined with single spaces
failed_urls = {}   # url -> repr of the exception raised while summarising it
for url in df.propurl.unique()[:50]:
    # Skip missing values (a NaN would make the substring test raise), the
    # "nourl" placeholder, and any URL containing 'twitter'.
    if not isinstance(url, str) or url == "nourl" or 'twitter' in url:
        continue
    try:
        # `Art` comes from the local Article module; presumably `summ()`
        # populates `sent_df` -- verify against that module.
        a = Art(url)
        a.summ()
        messages[url] = " ".join(a.sent_df.text)
    except Exception as exc:
        # Best-effort: one bad article must not abort the run, but keep a
        # record instead of discarding the error. Catching `Exception` (not a
        # bare `except`) lets KeyboardInterrupt stop the loop.
        failed_urls[url] = repr(exc)

In [18]:
# One document vector per summarised article, in the insertion order of `messages`.
message_embeddings = [encoder(text).vector for text in messages.values()]

In [19]:
import numpy as np
from sklearn.cluster import KMeans

# Number of clusters, i.e. how many representative article summaries are
# reported downstream. Capped at the number of embedded articles because
# KMeans cannot fit more clusters than samples.
n_clusters = min(5, len(message_embeddings))
# n_init is pinned to 10 (the library default before scikit-learn 1.4, where
# it became 'auto') so the clustering does not change with the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=1234)
kmeans = kmeans.fit(message_embeddings)

In [20]:
from sklearn.metrics import pairwise_distances_argmin_min

# Pick one representative article per cluster: the message whose embedding is
# nearest to that cluster's centroid.
texts = list(messages.values())

# Mean row position of each cluster's members; used only to order the clusters.
avg = [np.mean(np.where(kmeans.labels_ == label)[0]) for label in range(n_clusters)]

# closest[c] is the index into message_embeddings of the point nearest centroid c.
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, message_embeddings)

# Clusters sorted by ascending mean member position.
ordering = sorted(range(n_clusters), key=avg.__getitem__)

# Keys are display ranks (position in `ordering`), not KMeans label ids.
summary = {rank: texts[closest[cluster_id]] for rank, cluster_id in enumerate(ordering)}

In [21]:
# Reverse lookup from summary text back to its article URL. If two URLs share
# an identical summary, the later entry in `messages` wins.
inv_mess = {text: url for url, text in messages.items()}
def pretty(d, indent=0, url_lookup=None):
    """Print each cluster's summary text followed by its source URL.

    Args:
        d: Mapping of cluster key -> summary text, or -> a nested mapping of
            the same shape (printed recursively, one tab deeper).
        indent: Number of leading tabs for the "Cluster" heading; the summary
            and URL lines are indented one tab further.
        url_lookup: Mapping of summary text -> source URL. When omitted, the
            module-level ``inv_mess`` is used, which keeps existing
            ``pretty(summary)`` calls working unchanged.

    Returns:
        None. Output goes to stdout.
    """
    if url_lookup is None:
        # Resolve the global lazily so the function can also be used (and
        # tested) with an explicit lookup when `inv_mess` does not exist.
        url_lookup = inv_mess
    body_pad = '\t' * (indent + 1)
    for key, value in d.items():
        print('\n' + '\t' * indent + 'Cluster ' + str(key))
        if isinstance(value, dict):
            pretty(value, indent + 1, url_lookup)
        else:
            # A summary with no known source prints a placeholder rather than
            # aborting the whole listing with KeyError.
            source = url_lookup.get(value, '<source URL not found>')
            print(body_pad + str(value) + '\n\n' + body_pad + str(source) + '\n')

In [23]:
# Show the tweet row(s) whose resolved URL is this Wikipedia diff.
df.loc[df["propurl"] == "https://en.wikipedia.org/w/index.php?diff=1023726321&oldid=1023617333"]

Unnamed: 0.1,Unnamed: 0,name,location,picurl,followers,dt,text,rts,favs,urls,hashtags,cleantext,propurl
24,24,bank-edits,,https://abs.twimg.com/images/themes/theme1/bg.png,1850,2021-05-18 00:37:58,Mexico at major beauty pageants Wikipedia article edited anonymously from Westpac Bank https://t.co/sDejvgJLLG https://t.co/GAUFSj4nPh,0,0,['https://t.co/sDejvgJLLG'],[],Mexico at major beauty pageants Wikipedia article edited anonymously from Westpac Bank https://t.co/GAUFSj4nPh,https://en.wikipedia.org/w/index.php?diff=1023726321&oldid=1023617333


In [22]:
# Print the representative summary and its source URL for every cluster.
pretty(summary)


Cluster 0
	Close Get email notifications on {{subject}} daily! Your notification has been saved. There was a problem saving your notification.

	https://www.tribdem.com/sports/scholastic-baseball-softball-roundup-portage-baseball-team-clinches-westpac-north-title/article_a4ed8f66-b692-11eb-a959-0b827ccf8e00.html


Cluster 1
	This is a list of '''[[Mexico]]''''s official representatives and their placements at the [[Big Four international beauty pageants|'''Big Four''' international beauty pageants]], considered the most important in the world. The country has a total of '''sixty-three placements''' and '''six victories''': This is a list of '''[[Mexico]]''''s official representatives and their placements at the [[Big Four international beauty pageants|'''Big Four''' international beauty pageants]], considered the most important in the world. The country has a total of '''sixty-three placements''' and '''six victories''': Mexico at Miss Universe, Miss World, Miss International, and Mis

In [24]:
# Peek at the first four (summary text, KMeans label) pairs.
list(zip(messages.values(), kmeans.labels_))[:4]

[('"Of course, the topic of the future of the [Westpac] New Zealand arm came up, but we\'re not in a position to comment further," the Government spokesperson said. In its interim results announcement earlier this month, King said Westpac is "continuing to assess what is in the best interests of shareholders regarding the ownership of our New Zealand business". While the Government is effectively a regulator of Westpac NZ, it\'s also a major client, as Westpac NZ provides the Government with the bulk of its banking services.',
  2),
 ('"We always have two goals that we want to do each season, and they are one ... let\'s put ourselves in a position to play for the WestPAC Championship, and two ... let\'s qualify for the district playoffs," Portage veteran coach Larry McCabe, who earned career win No. "That\'s a good ballclub right there," McCabe said of Berlin. Portage answered with a run in the bottom of the second to cut the lead to 2-1 on an RBI single by Kargo, scoring Claar, who si

In [34]:
# Engagement totals for the article representing the chosen cluster.
# `clus` is a key of `summary`, i.e. the number printed after "Cluster" by pretty().
clus = 2
base.loc[base["propurl"] == inv_mess[summary[clus]]]

Unnamed: 0,propurl,favs,rts,text
61,https://www.interest.co.nz/banking/110452/westpac-group-ceo-and-chair-travel-australia-and-discuss-potential-sale-westpac-nz,1,1,1


In [35]:
# Underlying tweet row(s) that shared the chosen cluster's representative article.
df.loc[df["propurl"] == inv_mess[summary[clus]]]

Unnamed: 0.1,Unnamed: 0,name,location,picurl,followers,dt,text,rts,favs,urls,hashtags,cleantext,propurl
6,6,Jenée Tibshraeny,"Wellington City, New Zealand",https://abs.twimg.com/images/themes/theme1/bg.png,3012,2021-05-18 02:58:31,"Westpac's top brass didn't wait long before travelling from Aus to meet with ""key stakeholders"" - Robertson, Ardern, Orr. \n\nWestpac Group CEO &amp; Chair were here last week. \n\nThe ownership of Westpac NZ is being reviewed. \nhttps://t.co/vhKM06jRef",1,1,['https://t.co/vhKM06jRef'],[],"Westpac's top brass didn't wait long before travelling from Aus to meet with ""key stakeholders"" - Robertson, Ardern, Orr. \n\nWestpac Group CEO &amp; Chair were here last week. \n\nThe ownership of Westpac NZ is being reviewed. \n",https://www.interest.co.nz/banking/110452/westpac-group-ceo-and-chair-travel-australia-and-discuss-potential-sale-westpac-nz
