In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
source = requests.get('https://ec.europa.eu/programmes/horizon2020/en/h2020-section/information-and-communication-technologies').text

In [3]:
import lxml
soup = BeautifulSoup(source, 'lxml')

### Body_data

In [5]:
paragraphs = soup.find_all('p')
paragraphs

[<p>The potential and capabilities of modern ICT systems are still growing exponentially fuelled by the progress in electronics, microsystems, networking, the ability to master increasingly complex cyber-physical systems and robots, and progress in data processing and human machine interfaces. These developments provide major opportunities for Europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices, systems and applications can be implemented.</p>,
 <p>These new solutions will enable a wealth of new business developments in particular for SMEs, and will contribute to boosting competitiveness, creating jobs and supporting growth.</p>,
 <p>The first ICT-Leadership in Enabling and Industrial Technologies (LEIT) Work Programme under H2020 provides a balanced response to the main challenges faced by Europe in the field: firstly, the need to maintain a strong expertise in key technology value chains; secondly, the necessity to move quicker

In [6]:
my_list = []
for paragraph in paragraphs:
    para = paragraph.get_text()
    my_list.append(para)

In [7]:
para_data =  ' '.join([str(elem) for elem in my_list])
para_data

'The potential and capabilities of modern ICT systems are still growing exponentially fuelled by the progress in electronics, microsystems, networking, the ability to master increasingly complex cyber-physical systems and robots, and progress in data processing and human machine interfaces. These developments provide major opportunities for Europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices, systems and applications can be implemented. These new solutions will enable a wealth of new business developments in particular for SMEs, and will contribute to boosting competitiveness, creating jobs and supporting growth. The first ICT-Leadership in Enabling and Industrial\xa0Technologies (LEIT)\xa0Work Programme under H2020 provides a balanced response to the main challenges faced by Europe in the field: firstly, the need to maintain a strong expertise in key technology value chains; secondly, the necessity to move quicker from research 

### Sections

In [9]:
left_article = soup.find_all('span', class_ = 'field-content')
left_article

[<span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/photonics">Photonics</a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/micro-and-nanoelectronics">Micro- and Nanoelectronics </a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/content-technologies-and-information-management-0">Content Technologies and Information Management </a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/new-generation-components-and-systems">A new generation of components and systems</a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/advanced-computing">Advanced Computing </a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/future-internet">Future Internet</a></span>,
 <span class="field-content"><a href="/programmes/horizon2020/en/h2020-section/robotics">Robotics</a></span>]

In [10]:
for i in left_article:
    print(i.text)

Photonics
Micro- and Nanoelectronics 
Content Technologies and Information Management 
A new generation of components and systems
Advanced Computing 
Future Internet
Robotics


In [11]:
my_list = []
for i in left_article:
    article = i.string
    my_list.append(article)

In [13]:
contents = ','.join([str(elem) for elem in my_list])
contents

'Photonics,Micro- and Nanoelectronics ,Content Technologies and Information Management ,A new generation of components and systems,Advanced Computing ,Future Internet,Robotics'

**article_header**

In [19]:
article = soup.find('div', class_ = 'field-item even').text
article

'ICT underpins innovation and competitiveness across a broad range of private and public markets and sectors.'

**Title**

In [15]:
title = soup.find('h1').string

In [17]:
import re
title = re.sub('[^A-Za-z]',' ', title)

In [18]:
title = title.replace('   ', '')
title

' Information and Communication Technologies '

In [20]:
# Getting all data together
data = title + ' .' + article
data = data+ '.' +contents
final_data = data+ '.' +para_data
final_data

' Information and Communication Technologies  .ICT underpins innovation and competitiveness across a broad range of private and public markets and sectors..Photonics,Micro- and Nanoelectronics ,Content Technologies and Information Management ,A new generation of components and systems,Advanced Computing ,Future Internet,Robotics.The potential and capabilities of modern ICT systems are still growing exponentially fuelled by the progress in electronics, microsystems, networking, the ability to master increasingly complex cyber-physical systems and robots, and progress in data processing and human machine interfaces. These developments provide major opportunities for Europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices, systems and applications can be implemented. These new solutions will enable a wealth of new business developments in particular for SMEs, and will contribute to boosting competitiveness, creating jobs and supporting 

## Entity

In [21]:
import spacy 
from spacy import displacy

In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
doc = nlp(final_data)

In [24]:
entities = []
labels = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)

In [25]:
df = pd.DataFrame({'Entities':entities,'Labels':labels})
df

Unnamed: 0,Entities,Labels
0,(Photonics),ORG
1,(Micro-),PERSON
2,(Nanoelectronics),ORG
3,"(Content, Technologies, and, Information, Mana...",ORG
4,"(Advanced, Computing)",PERSON
5,(ICT),ORG
6,(Europe),LOC
7,(first),ORDINAL
8,(ICT),ORG
9,(LEIT),ORG


## Key-Phrases

In [26]:
from collections import Counter
from string import punctuation

def key_phrases(text):
    result = []
    pos_tag = ['NOUN', 'ADJ', 'NOUN']
    doc = nlp(text.lower())
    
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
            
        if(token.pos_ in pos_tag):
            result.append(token.text)
            
    return result

In [27]:
Key_Phrases = key_phrases(final_data)

In [28]:
print(Key_Phrases)

['information', 'communication', 'technologies', 'innovation', 'competitiveness', 'broad', 'range', 'private', 'public', 'markets', 'sectors', 'photonics', 'micro-', 'nanoelectronics', 'content', 'technologies', 'information', 'management', 'new', 'generation', 'components', 'systems', 'advanced', 'computing', 'future', 'internet', 'potential', 'capabilities', 'modern', 'ict', 'systems', 'progress', 'electronics', 'microsystems', 'networking', 'ability', 'complex', 'cyber', 'physical', 'systems', 'robots', 'data', 'processing', 'human', 'machine', 'interfaces', 'developments', 'major', 'opportunities', 'generation', 'open', 'platforms', 'multiplicity', 'innovative', 'devices', 'systems', 'applications', 'new', 'solutions', 'wealth', 'new', 'business', 'developments', 'particular', 'smes', 'competitiveness', 'jobs', 'growth', 'ict', 'leadership', 'industrial', 'technologies', 'leit', 'work', 'programme', 'h2020', 'balanced', 'response', 'main', 'challenges', 'field', 'need', 'strong', '

## Sentiment Analysis

In [29]:
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

In [30]:
# Next, we initialize VADER so we can use it within our Python script
sid = SentimentIntensityAnalyzer()

In [31]:
# We will also initialize our 'english.pickle' function and give it a short name

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [32]:
message_text = final_data

In [33]:
#sentence tokenizing our data
sentences = tokenizer.tokenize(message_text)


In [34]:
for sentence in sentences:
        print(sentence)
        scores = sid.polarity_scores(sentence)
        for key in sorted(scores):
                print('{0}: {1}, '.format(key, scores[key]), end='')
        print()


 Information and Communication Technologies  .ICT underpins innovation and competitiveness across a broad range of private and public markets and sectors..Photonics,Micro- and Nanoelectronics ,Content Technologies and Information Management ,A new generation of components and systems,Advanced Computing ,Future Internet,Robotics.The potential and capabilities of modern ICT systems are still growing exponentially fuelled by the progress in electronics, microsystems, networking, the ability to master increasingly complex cyber-physical systems and robots, and progress in data processing and human machine interfaces.
compound: 0.8807, neg: 0.0, neu: 0.85, pos: 0.15, 
These developments provide major opportunities for Europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices, systems and applications can be implemented.
compound: 0.743, neg: 0.0, neu: 0.781, pos: 0.219, 
These new solutions will enable a wealth of new business developments 

## Thank You!!