In [1]:
from IPython.display import clear_output
!pip install pandas tqdm requests httplib2 google-cloud-language
clear_output()
print('All Installed.')

All Installed.


In [2]:
import sys
import httplib2
import pandas as pd
import logging
import time
from tqdm import tqdm
import warnings
import datetime as dt
import os
import string
from datetime import timedelta, date
from urllib.error import HTTPError
from IPython.display import clear_output
import requests
import json

from google.cloud import language
from google.oauth2 import service_account
from google.cloud.language import enums
from google.cloud.language import types

# Build the Natural Language API client used by the cells below.
# Credentials are read from the service-account key file 'services.json';
# the path is relative, so the file must sit in the kernel's working directory.
client = language.LanguageServiceClient.from_service_account_json('services.json')

In [3]:
def load_text_from_url(url, **data):
    
    '''
    Loads html from a URL and saves a copy of it to "myfile.txt"
    (in the current working directory, overwritten on every call).
    Parameters:
        * url: url of page to load (str)
        * timeout: request timeout in seconds (int) default: 20
    Returns:
    HTML (str) when the server answers with status 200 and a non-empty body.
    None when the status is not 200, the body is empty, or any error occurs
    while fetching the page or writing the file (the error is printed, not raised).
    '''
    
    timeout = data.get('timeout', 20)
    
    try:
        
        print("Extracting text from: {}".format(url))
        response = requests.get(url, timeout=timeout)
        text = response.text

        # Anything other than a successful, non-empty page counts as "no content".
        if response.status_code != 200 or not text:
            return None

        # Write with an explicit encoding: with the platform default, a page
        # containing characters outside the locale's codec would raise
        # UnicodeEncodeError and be misreported as a URL problem.
        # The context manager closes the file even if the write fails.
        with open("myfile.txt", "w", encoding="utf-8") as dump_file:
            dump_file.write(text)
        print("File generated")
        return text

    except Exception as e:
        # Best-effort loader: never raise, but say what actually went wrong
        # (timeout, DNS failure, disk error, ...) instead of hiding it.
        print('Problem with url: {0}.'.format(url))
        print('Reason: {0!r}'.format(e))
        return None

In [5]:
def pull_googlenlp(client, url, invalid_types = ['OTHER'], **data):
    
    '''
    Loads HTML from a URL and passes to the Google NLP API
    Parameters:
        * client: Authenticated Language Service Client.
        * url: url of page to load (str)
        * invalid_types: entity types to ignore (eg. CONSUMER_GOOD, NUMBER) (list) default: OTHER
        * timeout: request timeout in seconds (int) default: 20
    Returns:
    Dictionary of sentiment(list), entities(list), categories(list) (dict):
        * sentiment: at most one {'magnitude', 'score'} entry for the whole document
        * entities: {'name', 'type', 'salience', 'wikipedia_url'} per kept entity
          ('wikipedia_url' is '-' when the API supplies none)
        * categories: {'name', 'confidence'} per category
    None when the page could not be loaded.
    '''    

    html = load_text_from_url(url, **data)
    
    if not html:
        return None
    
    document = types.Document(
        content=html,
        type=language.enums.Document.Type.HTML )

    # 'classify_text' stays off here because classification is requested
    # through the dedicated classify_text call below.
    features = {'extract_syntax': True,
                'extract_entities': True,
                'extract_document_sentiment': True,
                'extract_entity_sentiment': True,
                'classify_text': False
                }
    
    annotation = client.annotate_text(document=document, features=features)
    sentiment = annotation.document_sentiment
    entities = annotation.entities
    
    classification = client.classify_text(document)
    categories = classification.categories
        
    def get_type(type_value):
        # Translate the numeric entity-type enum value into its name
        # (e.g. 'PERSON'). Works on the value it is given rather than on
        # whatever `entity` happens to be bound in the enclosing scope.
        return client.enums.Entity.Type(type_value).name
    
    result = {'sentiment': [], 'entities': [], 'categories': []}

    if sentiment:
        result['sentiment'] = [{ 'magnitude': sentiment.magnitude, 'score':sentiment.score }]
        
    for entity in entities:
        # Resolve the type name once; it is used for both filtering and output.
        type_name = get_type(entity.type)
        if type_name in invalid_types:
            continue
        result['entities'].append({'name': entity.name,
                                   'type': type_name,
                                   'salience': entity.salience,
                                   'wikipedia_url': entity.metadata.get('wikipedia_url', '-')})
            
    for category in categories:
        result['categories'].append({'name':category.name, 'confidence': category.confidence})
        
    return result


In [9]:
# Example run: analyse one article with the shared `client`.
# The call is the cell's last expression, so the notebook displays the
# returned dict (sentiment / entities / categories) as the cell output.
url = "https://opensource.com/article/19/6/how-ssh-running-container"
pull_googlenlp(client,url)

Extracting text from: https://opensource.com/article/19/6/how-ssh-running-container


{'sentiment': [{'magnitude': 79.80000305175781, 'score': 0.0}],
 'entities': [{'name': 'SSH',
   'type': 'CONSUMER_GOOD',
   'salience': 0.0326605923473835,
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Secure_Shell'},
  {'name': 'Seth Kenlon',
   'type': 'PERSON',
   'salience': 0.025843223556876183,
   'wikipedia_url': '-'},
  {'name': 'Linux',
   'type': 'CONSUMER_GOOD',
   'salience': 0.02205348014831543,
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Linux'},
  {'name': 'comments',
   'type': 'WORK_OF_ART',
   'salience': 0.01498269010335207,
   'wikipedia_url': '-'},
  {'name': 'Python',
   'type': 'ORGANIZATION',
   'salience': 0.007452790625393391,
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Python_(programming_language)'},
  {'name': 'organization',
   'type': 'ORGANIZATION',
   'salience': 0.0066571542993187904,
   'wikipedia_url': '-'},
  {'name': 'open source tools',
   'type': 'CONSUMER_GOOD',
   'salience': 0.006460968870669603,
   'wikipedia_url': '-'},
 