### Import Libraries and Models

In [1]:
from pprint import pprint
from spacy_stanza import StanzaLanguage
import stanza
import spacy
import nltk
import tensorflow as tf
import re
import random
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import requests
from bs4 import BeautifulSoup
import html2text
from summarizer import Summarizer
from textblob import TextBlob
stop_words = stopwords.words('english')
snlp = stanza.Pipeline(lang="en")
stanza_nlp = StanzaLanguage(snlp)
spacy_nlp = spacy.load('en_core_web_lg')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
2020-05-13 17:31:24 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-05-13 17:31:24 INFO: Use device: cpu
2020-05-13 17:31:24 INFO: Loading: tokenize
2020-05-13 17:31:24 INFO: Loading: pos
2020-05-13 17:31:25 INFO: Loading: lemma
2020-05-13 17:31:26 INFO: Loading: depparse
2020-05-13 17:31:27 INFO: Loading: ner
2020-05-13 17:31:28 INFO: Done loading processors!


In [2]:
"""
vgi global media
"""

'\nWINSON OIL TRADING PTE. LTD\n'

### Defining Client Profile Summary from Documentum

In [17]:
fromkyc = "VGI is a unique market leader with exclusive access to behavioral data from our advertising, payment and logistics platforms. \
We turn data to meaningful consumer insight, enabling us to offer Offline-to-Online (O2O) Solutions. \
To provide a better customer experience, we help brands to navigate their customers at every stage of the purchasing process.\
Their main business is Payment, which they have joined hand with the leading partners including BTS, AIS, Line, Kerry Express.\
VGI successfully acquired 23% in Kerry Express – the leading parcel delivery in Thailand."

kyc_doc = spacy_nlp(fromkyc.strip())

### Source URL list to crawl

In [28]:
source_url_list = [
    "https://www.reuters.com/companies/VGIn.BK",
    "https://www.zoominfo.com/c/Vgi-global-media-plc/372153320"
]

### HTML Parsing and Text Cleaning

In [29]:
def text_cleaning(raw_text):
    raw_text_list = raw_text.split('\n')
#     raw_text_list = [
#         token for token in raw_text_list if token not in stop_words
#     ]
    clean_sent_list = [
        re.sub('[^A-Za-z0-9]+\.-/', '', token) for token in raw_text_list
        if bool(token)
    ]
    clean_sent = ' '.join(clean_sent_list)
    clean_sent = ' '.join(clean_sent.split())
    doc = stanza_nlp(clean_sent)

    spacy_text_list = []
    for sent in doc.sents:
        spacy_text_list.append(sent.text)
    import random
    #     spacy_text_list = random.sample(spacy_text_list, len(spacy_text_list))
    return spacy_text_list


def tag2text(tag):

    if tag.name == 'p':
        return tag.text


def parse_article(text):
    soup = BeautifulSoup(text, 'html.parser')

    # find the article title
    h1 = soup.find('h1')

    # find the common parent for <h1> and all <p>s.
    root = h1
    while root.name != 'body':
        if root.parent == None:
            break
        root = root.parent

    # find all the content elements.
    ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
    ps.insert(0, h1)
    content = [tag2text(p) for p in ps]
    content = [x for x in content if bool(x)]
    return content

### Creating Corpus of Information from WEB

In [30]:
def gather_content_data(url_list):
    assert len(url_list) > 0
    corpus = []
    for url in url_list:
        print(url)
        content = parse_article(requests.get(url).text)
        if bool(content):
            corpus.append(' '.join(content))
    spacy_text_list = text_cleaning(' '.join(corpus))
    return ' '.join(spacy_text_list)

In [31]:
corpus = gather_content_data(url_list=source_url_list)

https://www.reuters.com/companies/VGIn.BK
https://www.zoominfo.com/c/Vgi-global-media-plc/372153320


In [32]:
corpus

"Discover Thomson Reuters More VGIn.BK Latest Trade Change Today's Range - 52 Week Range - As of on the Stock Exchange of Thailand (Bangkok) ∙ Minimum 15 minute delay Profile News Key Developments Charts People Financials Key Metrics Events All Listings VGI Approves Interim Dividend Of 0.045 Baht Per Share VGI Pcl Posts Qtrly Total Revenues 1.93 Bln Baht Vs 1.53 Bln Baht VGI Pcl Posts Qtrly Profit For The Period 382.1 Mln Baht Vs 285.5 Mln Baht VGI Public Company Limited, formerly VGI Global Media Public Company Limited, is a Thailand-based Company engaged in the provision of marketing and advertising services. The Company’s business activities are divided into three groups: mass transit, modern trade, and multimedia. Mass transit media consists of Bangkok mass transit system (BTS) merchandising, which refers to the shops on the BTS stations, BTS advertising, Chulalongkorn University (CU) media, which comprises bus shelter and bus body media, and airport rail link. Modern trade include

### BERT based Text Summarization

In [34]:
model = Summarizer()

In [35]:
result = model(corpus, min_length=30, algorithm='gmm',ratio=0.5, max_length=len(corpus))
full = ''.join(result)
print(full)
testimonial = TextBlob(full)
print('\n Polarity of Article:', testimonial.sentiment.polarity)

Discover Thomson Reuters More VGIn. The Company’s business activities are divided into three groups: mass transit, modern trade, and multimedia. Modern trade includes sales floor media, which is a choice of various formats of media located at the point of purchase or nearby, and non sale floor media located from the entrance area of the stores, car parks and mall areas, such as Tesco Lotus and Big C. Multimedia is the digital screen networks across the country, such as BTS and office towers. Industry Advertising Contact Info Floor 9, TST Tower 21 Wiphawadirangsit Road Chomphon, Chatuchak 10900 Thailand +66.22.738884 Executive Leadership Khiri Karnchanaphat Chairman of the Board Kawin Karnchanaphat Chairman of the Executive Board and Director Lap Shun Leung Chief Executive Officer Supharanan Tanwirat Chief Financial Officer and Executive Director Darani Phanklin Managing Director of Finance 2017 2018 2019 2020(E) 2017 2018 2019 2020(E) New Stories * TO INVEST IN 23 PERCENT OF SHARES IN 

### Similarity Score Calculation using spaCy

In [36]:
extraction = spacy_nlp(full)
similarity_score = extraction.similarity(kyc_doc)
print('The Similarity Score of Summarized Text is: ', similarity_score*100)

The Similarity Score of Summarized Text is:  91.2161212390825
