In [1]:
import string
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
text = open('ICT.txt', encoding='cp1252').read()
lower_case = text.lower()
lower_case

'ict underpins innovation and competitiveness across a broad range of private and public markets and sectors.\nthe potential and capabilities of modern ict systems are still growing exponentially fuelled by the progress in electronics, microsystems, networking, the ability to master increasingly complex cyber-physical systems and robots, and progress in data processing and human machine interfaces. these developments provide major opportunities for europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices, systems and applications can be implemented.\n\nthese new solutions will enable a wealth of new business developments in particular for smes, and will contribute to boosting competitiveness, creating jobs and supporting growth.\n\nmain features of the work programme\nthe first ict-leadership in enabling and industrial technologies (leit) work programme under h2020 provides a balanced response to the main challenges faced by europe in

In [3]:
cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))
cleaned_text

'ict underpins innovation and competitiveness across a broad range of private and public markets and sectors\nthe potential and capabilities of modern ict systems are still growing exponentially fuelled by the progress in electronics microsystems networking the ability to master increasingly complex cyberphysical systems and robots and progress in data processing and human machine interfaces these developments provide major opportunities for europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices systems and applications can be implemented\n\nthese new solutions will enable a wealth of new business developments in particular for smes and will contribute to boosting competitiveness creating jobs and supporting growth\n\nmain features of the work programme\nthe first ictleadership in enabling and industrial technologies leit work programme under h2020 provides a balanced response to the main challenges faced by europe in the field firs

In [4]:
tokenized_words = word_tokenize(cleaned_text, "english")
tokenized_words

['ict',
 'underpins',
 'innovation',
 'and',
 'competitiveness',
 'across',
 'a',
 'broad',
 'range',
 'of',
 'private',
 'and',
 'public',
 'markets',
 'and',
 'sectors',
 'the',
 'potential',
 'and',
 'capabilities',
 'of',
 'modern',
 'ict',
 'systems',
 'are',
 'still',
 'growing',
 'exponentially',
 'fuelled',
 'by',
 'the',
 'progress',
 'in',
 'electronics',
 'microsystems',
 'networking',
 'the',
 'ability',
 'to',
 'master',
 'increasingly',
 'complex',
 'cyberphysical',
 'systems',
 'and',
 'robots',
 'and',
 'progress',
 'in',
 'data',
 'processing',
 'and',
 'human',
 'machine',
 'interfaces',
 'these',
 'developments',
 'provide',
 'major',
 'opportunities',
 'for',
 'europe',
 'to',
 'develop',
 'the',
 'next',
 'generation',
 'of',
 'open',
 'platforms',
 'on',
 'top',
 'of',
 'which',
 'a',
 'multiplicity',
 'of',
 'innovative',
 'devices',
 'systems',
 'and',
 'applications',
 'can',
 'be',
 'implemented',
 'these',
 'new',
 'solutions',
 'will',
 'enable',
 'a',
 'wea

In [5]:
len(tokenized_words)

332

In [6]:
final_words = []
for word in tokenized_words:
    if word not in stopwords.words('english'):
        final_words.append(word)

In [7]:
len(final_words)

211

In [8]:
lemma_words = []

for word in final_words:
    word = WordNetLemmatizer().lemmatize(word)
    lemma_words.append(word)

In [9]:
lemma_words

['ict',
 'underpins',
 'innovation',
 'competitiveness',
 'across',
 'broad',
 'range',
 'private',
 'public',
 'market',
 'sector',
 'potential',
 'capability',
 'modern',
 'ict',
 'system',
 'still',
 'growing',
 'exponentially',
 'fuelled',
 'progress',
 'electronics',
 'microsystems',
 'networking',
 'ability',
 'master',
 'increasingly',
 'complex',
 'cyberphysical',
 'system',
 'robot',
 'progress',
 'data',
 'processing',
 'human',
 'machine',
 'interface',
 'development',
 'provide',
 'major',
 'opportunity',
 'europe',
 'develop',
 'next',
 'generation',
 'open',
 'platform',
 'top',
 'multiplicity',
 'innovative',
 'device',
 'system',
 'application',
 'implemented',
 'new',
 'solution',
 'enable',
 'wealth',
 'new',
 'business',
 'development',
 'particular',
 'smes',
 'contribute',
 'boosting',
 'competitiveness',
 'creating',
 'job',
 'supporting',
 'growth',
 'main',
 'feature',
 'work',
 'programme',
 'first',
 'ictleadership',
 'enabling',
 'industrial',
 'technology',


In [10]:
emotion_list = []
with open('Emotions.txt', 'r') as file:
    for line in file:
        clear_line = line.replace("\n", '').replace(",", '').replace("'", '').strip()
        word, emotion = clear_line.split(':')

        if word in lemma_words:
            emotion_list.append(emotion)
            
print("People emotions from the text \n", emotion_list, '\n \n')


w = Counter(emotion_list)
print("Count of each emotion \n", w)

People emotions from the text 
 [' attracted', ' independent'] 
 

Count of each emotion 
 Counter({' attracted': 1, ' independent': 1})


In [11]:
emotion_list

[' attracted', ' independent']

In [12]:
sia = SentimentIntensityAnalyzer()
sent = cleaned_text 
print (sia.polarity_scores(sent))

{'neg': 0.01, 'neu': 0.761, 'pos': 0.229, 'compound': 0.9972}


In [13]:
def sentiment_analyse(sentiment_text):
    
    score = SentimentIntensityAnalyzer().polarity_scores(sentiment_text)
    
    if score['neg'] > score['pos']:
        print("\n     ******Negative Sentiment*******")
        
    elif score['neg'] < score['pos']:
        print("\n     ******Positive Sentiment*******")
        
    else:
        print("Neutral Sentiment")

In [14]:
sentiment_analyse(cleaned_text)


     ******Positive Sentiment*******


# Entity Recognition

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import spacy
nlp = spacy.load("en_core_web_sm")

In [17]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
text_scrape = url_to_string('https://ec.europa.eu/programmes/horizon2020/en/h2020-section/information-and-communication-technologies')
article = nlp(text_scrape)
len(article.ents)

75

In [18]:
pos_tags = nltk.pos_tag(lemma_words)
pos_tags

[('ict', 'NN'),
 ('underpins', 'VBZ'),
 ('innovation', 'NN'),
 ('competitiveness', 'NN'),
 ('across', 'IN'),
 ('broad', 'JJ'),
 ('range', 'NN'),
 ('private', 'JJ'),
 ('public', 'JJ'),
 ('market', 'NN'),
 ('sector', 'NN'),
 ('potential', 'JJ'),
 ('capability', 'NN'),
 ('modern', 'JJ'),
 ('ict', 'NN'),
 ('system', 'NN'),
 ('still', 'RB'),
 ('growing', 'VBG'),
 ('exponentially', 'RB'),
 ('fuelled', 'VBN'),
 ('progress', 'NN'),
 ('electronics', 'NNS'),
 ('microsystems', 'VBP'),
 ('networking', 'VBG'),
 ('ability', 'NN'),
 ('master', 'NN'),
 ('increasingly', 'RB'),
 ('complex', 'JJ'),
 ('cyberphysical', 'JJ'),
 ('system', 'NN'),
 ('robot', 'JJ'),
 ('progress', 'NN'),
 ('data', 'NNS'),
 ('processing', 'NN'),
 ('human', 'JJ'),
 ('machine', 'NN'),
 ('interface', 'NN'),
 ('development', 'NN'),
 ('provide', 'IN'),
 ('major', 'JJ'),
 ('opportunity', 'NN'),
 ('europe', 'VBP'),
 ('develop', 'VB'),
 ('next', 'JJ'),
 ('generation', 'NN'),
 ('open', 'JJ'),
 ('platform', 'NN'),
 ('top', 'JJ'),
 ('multi

In [19]:
chunks = nltk.ne_chunk(pos_tags, binary=True)
for chunk in chunks:
    print(chunk)

('ict', 'NN')
('underpins', 'VBZ')
('innovation', 'NN')
('competitiveness', 'NN')
('across', 'IN')
('broad', 'JJ')
('range', 'NN')
('private', 'JJ')
('public', 'JJ')
('market', 'NN')
('sector', 'NN')
('potential', 'JJ')
('capability', 'NN')
('modern', 'JJ')
('ict', 'NN')
('system', 'NN')
('still', 'RB')
('growing', 'VBG')
('exponentially', 'RB')
('fuelled', 'VBN')
('progress', 'NN')
('electronics', 'NNS')
('microsystems', 'VBP')
('networking', 'VBG')
('ability', 'NN')
('master', 'NN')
('increasingly', 'RB')
('complex', 'JJ')
('cyberphysical', 'JJ')
('system', 'NN')
('robot', 'JJ')
('progress', 'NN')
('data', 'NNS')
('processing', 'NN')
('human', 'JJ')
('machine', 'NN')
('interface', 'NN')
('development', 'NN')
('provide', 'IN')
('major', 'JJ')
('opportunity', 'NN')
('europe', 'VBP')
('develop', 'VB')
('next', 'JJ')
('generation', 'NN')
('open', 'JJ')
('platform', 'NN')
('top', 'JJ')
('multiplicity', 'NN')
('innovative', 'JJ')
('device', 'NN')
('system', 'NN')
('application', 'NN')
('im

In [20]:
t = text.replace('\n',' ').replace('.',' ').replace(',',' ').replace('(','').replace(')','').replace('-',' ')
t

'ICT underpins innovation and competitiveness across a broad range of private and public markets and sectors  The potential and capabilities of modern ICT systems are still growing exponentially fuelled by the progress in electronics  microsystems  networking  the ability to master increasingly complex cyber physical systems and robots  and progress in data processing and human machine interfaces  These developments provide major opportunities for Europe to develop the next generation of open platforms on top of which a multiplicity of innovative devices  systems and applications can be implemented   These new solutions will enable a wealth of new business developments in particular for SMEs  and will contribute to boosting competitiveness  creating jobs and supporting growth   Main features of the Work Programme The first ICT Leadership in Enabling and Industrial Technologies LEIT Work Programme under H2020 provides a balanced response to the main challenges faced by Europe in the fie

In [21]:
doc = nlp(t)
entity =[]
label =[]
for ent in doc.ents:
    label.append(ent.label_)
    entity.append(ent)

In [22]:
df = pd.DataFrame({'Entity':entity,'Label':label})
df

Unnamed: 0,Entity,Label
0,(ICT),ORG
1,(ICT),ORG
2,(Europe),LOC
3,"(the, Work, Programme)",LAW
4,(first),ORDINAL
5,(Europe),LOC
6,(firstly),ORDINAL
7,(secondly),ORDINAL
8,(Six),CARDINAL
9,"(ICT, LEIT)",ORG


In [23]:
cleaned_text1 = text_scrape.translate(str.maketrans('', '', string.punctuation))
cleaned_text1

'      Information and Communication Technologies  Horizon 2020                     Skip to main content                                                                                                               Log in                                                                                                                  European Commission                                                                              Search this website                                                                                     European CommissionFunding TendersFunding opportunitiesFunding programmesHorizon 2020                                 Horizon 2020                                               Main menu            What is Horizon 2020 Find Your area How to get funding News Events Publications Projects The next Framework Programme                                      \xa0                                        Sections navigation                                                

In [24]:
t1 = cleaned_text1.replace('\n',' ').replace('.',' ').replace(',',' ').replace('(','').replace(')','').replace('-',' ')

In [25]:
doc = nlp(t1)
entity =[]
label =[]
for ent in doc.ents:
    label.append(ent.label_)
    entity.append(ent)

In [26]:
df1 = pd.DataFrame({'Entity':entity,'Label':label})
df1

Unnamed: 0,Entity,Label
0,(European),NORP
1,(2020),DATE
2,"(News, Events, Publications, Projects)",ORG
3,"(Framework, Programme, ...",ORG
4,"(Nanoelectronics, , Content, T...",ORG
...,...,...
65,"(Facebook, Twitter)",PERSON
66,"(European, Union)",ORG
67,(EU),ORG
68,"(European, Union)",ORG


In [27]:
df1.to_csv('Entity 1.csv', index = False)