### Import Libraries and Models

In [1]:
from pprint import pprint
from spacy_stanza import StanzaLanguage
import stanza
import spacy
import nltk
import tensorflow as tf
import re
import random
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import requests
from bs4 import BeautifulSoup
import html2text
from summarizer import Summarizer
from textblob import TextBlob
stop_words = stopwords.words('english')
snlp = stanza.Pipeline(lang="en")
stanza_nlp = StanzaLanguage(snlp)
spacy_nlp = spacy.load('en_core_web_lg')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
2020-05-15 14:27:35 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-05-15 14:27:35 INFO: Use device: cpu
2020-05-15 14:27:35 INFO: Loading: tokenize
2020-05-15 14:27:35 INFO: Loading: pos
2020-05-15 14:27:36 INFO: Loading: lemma
2020-05-15 14:27:37 INFO: Loading: depparse
2020-05-15 14:27:38 INFO: Loading: ner
2020-05-15 14:27:39 INFO: Done loading processors!


In [2]:
"""
shv holdings
"""

'\nWINSON OIL TRADING PTE. LTD\n'

### Defining Client Profile Summary from Documentum

In [2]:
fromkyc = "SHV is a family-owned, decentralised company active in energy distribution, cash-and-carry wholesale, heavy lifting and transport activities, industrial services, animal nutrition and aquafeed, exploration, development and production of oil and gas and providing private equity through its seven companies: SHV Energy, Makro, Mammoet, ERIKS, Nutreco, ONE-Dyas and NPM Capital. SHV employs more than 60,000 people and is present in 58 countries."

kyc_doc = spacy_nlp(fromkyc.strip())

### Source URL list to crawl

In [14]:
source_url_list = [
    "https://www.referenceforbusiness.com/history2/94/SHV-Holdings-N-V.html",
    "https://www.fis.com/fis/companies/details.asp?l=e&company_id=158503",
    "https://www.wikiwand.com/en/SHV_Holdings"
]

### HTML Parsing and Text Cleaning

In [15]:
def text_cleaning(raw_text):
    raw_text_list = raw_text.split('\n')
    #     raw_text_list = [
    #         token for token in raw_text_list if token not in stop_words
    #     ]
    clean_sent_list = [
        re.sub('[^A-Za-z0-9]+\.-/', '', token) for token in raw_text_list
        if bool(token)
    ]
    clean_sent = ' '.join(clean_sent_list)
    clean_sent = ' '.join(clean_sent.split())
    doc = stanza_nlp(clean_sent)

    spacy_text_list = []
    for sent in doc.sents:
        spacy_text_list.append(sent.text)
    import random
    #     spacy_text_list = random.sample(spacy_text_list, len(spacy_text_list))
    return spacy_text_list


def tag2text(tag):

    if tag.name == 'p':
        return tag.text


def parse_article(text):
    soup = BeautifulSoup(text, 'html.parser')

    try:
        # find the article title
        h1 = soup.find('h1')

        # find the common parent for <h1> and all <p>s.
        root = h1
        while root.name != 'body':
            if root.parent == None:
                break
            root = root.parent

        # find all the content elements.
        ps = root.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre'])
        ps.insert(0, h1)
        content = [tag2text(p) for p in ps]
        content = [x for x in content if bool(x)]
        return content
    except Exception as e:
        print(e)
        return None

### Creating Corpus of Information from WEB

In [61]:
def gather_content_data(url_list):
    assert len(url_list) > 0
    corpus = []
    for url in url_list:
        print(url)
        content = parse_article(requests.get(url).text)
        if bool(content):
            corpus.append(' '.join(content))
    spacy_text_list = text_cleaning(' '.join(corpus))
    return ' '.join(spacy_text_list)

In [62]:
corpus = gather_content_data(url_list=source_url_list)

https://www.referenceforbusiness.com/history2/94/SHV-Holdings-N-V.html
https://www.fis.com/fis/companies/details.asp?l=e&company_id=158503
'NoneType' object has no attribute 'name'
https://www.wikiwand.com/en/SHV_Holdings


In [63]:
corpus

'Rijnkade 13511 LC Utrecht The Netherlands Company Perspectives: SHV is a privately held company and wishes to remain so. SHV is a decentralised company. Great trust is placed in our people in the field. This decentralisation provides an excellent opportunity for individual development. Mutual respect and trust provide the basis for happiness at work. SHV\'s most important values are integrity and loyalty. Integrity means being honest, genuine, and totally open in communications about all matters that concern the company. Good news may travel slowly, bad news should travel quickly. Loyalty means putting your best effort into your work for the company and its development. Based on the integrity and loyalty of our people, SHV wishes to continue to grow both for the benefit of our shareholders, our employees, and for the well-being of the society in which we live and work. SHV Holdings N.V. is one of the Netherlands\' largest private companies. The family-owned concern serves a holding co

In [82]:
0.5*np.floor(len(corpus)/len(fromkyc))

18.0

### BERT based Text Summarization

In [74]:
model = Summarizer()

In [75]:
result = model(corpus,
               min_length=10,
               max_length=100,
               algorithm='gmm',
               ratio=0.5)
full = ''.join(result)
print(full)
testimonial = TextBlob(full)
print('\n Polarity of Article:', testimonial.sentiment.polarity)



 Polarity of Article: 0.0


### Similarity Score Calculation using spaCy

In [66]:
extraction = spacy_nlp(full)
similarity_score = extraction.similarity(kyc_doc)
print('The Similarity Score of Summarized Text is: ', similarity_score * 100)

The Similarity Score of Summarized Text is:  91.93679763169389
