# 1. Use urllib or requests package to read this CNBC article through URL link
# 2. Use BeautifulSoup or another HTML parsing package to extract text from the article

In [5]:
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib import request

def tag_visible(element):
    """Tell whether a text node belongs to the rendered part of the page.

    A node counts as hidden when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.

    Args:
        element: A text node; only ``element.parent.name`` and the node's
            type are inspected.

    Returns:
        bool: ``False`` for hidden nodes, ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check runs first; the Comment check only runs when it fails.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup handed to BeautifulSoup (the caller in this
            notebook passes the bytes returned by ``urlopen(...).read()``).

    Returns:
        str: Every text node accepted by ``tag_visible``, stripped of
        surrounding whitespace and joined with a space between consecutive
        nodes. Whitespace-only nodes contribute empty strings, so runs of
        spaces can appear in the result.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns every text node in the document.
    texts = soup.find_all(string=True)
    return u" ".join(t.strip() for t in texts if tag_visible(t))

# CNBC article analysed throughout this notebook.
ARTICLE_URL = 'https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html'

# Download the raw page. The context manager closes the HTTP response even
# if read() raises, instead of leaving the connection open.
with request.urlopen(ARTICLE_URL) as response:
    html = response.read()

# Reduce the markup to its visible text.
text = text_from_html(html)

In [6]:
# Display the visible text extracted from the article page.
print(text)

× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

# 3. Use re (regular expression) package to:

#### Find all matches of dollar sign amounts in the article

In [7]:
import re

In [8]:
#using findall function
# With one capture group, findall returns only the captured digits, i.e. each
# amount without its leading dollar sign.
# Raw string: in a plain literal, \$ and \d are invalid escape sequences and
# trigger a warning on recent Python versions.
re.findall(r'\$(\d+)', text)

['325', '351']

In [9]:
#using finditer function
# group() is the whole match, so the dollar sign is kept here.
# Raw string: in a plain literal, \$ and \d are invalid escape sequences and
# trigger a warning on recent Python versions.
[match.group() for match in re.finditer(r'\$(\d+)', text)]

['$325', '$351']

#### Substitute all numbers with # character and print the output

In [10]:
# Replace every ASCII digit in the article text with '#' and print the result.
digit_pattern = re.compile(r'[0-9]')
print(digit_pattern.sub('#', text))

× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

#### Count (using regular expressions) “Netflix” and “Disney” mentions

In [11]:
# Count “Netflix” mentions (case-sensitive regex matches in the article text).
sum(1 for _ in re.finditer(r'Netflix', text))

13

In [12]:
# Count “Disney” mentions (case-sensitive regex matches in the article text).
sum(1 for _ in re.finditer(r'Disney', text))

7

# 4. Use NLTK and/or spaCy tokenization features to:

#### Tokenize sentences and words

In [13]:
from nltk import word_tokenize, sent_tokenize

In [14]:
# Split the article text into sentences, then print each sentence followed
# by a blank line.
sentences = sent_tokenize(text)
print("".join(s + "\n\n" for s in sentences), end="")

× LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [15]:
# Split the article text into word-level tokens and print one token per line.
tokens = word_tokenize(text)
if tokens:
    print(*tokens, sep="\n")

×
LOG
IN
SIGN
UP
Keep
Me
Logged
In
Skip
Navigation
SIGN
IN
Pro
Watchlist
Make
It
Select
USA
INTL
Markets
Pre-Markets
U.S.
Markets
Currencies
Cryptocurrency
Futures
&
Commodities
Bonds
Funds
&
ETFs
Watchlist
Business
Economy
Finance
Health
&
Science
Media
Real
Estate
Energy
Transportation
Industrials
Retail
Wealth
Small
Business
Investing
Invest
In
You
Personal
Finance
Financial
Advisors
Trading
Nation
Options
Action
ETF
Street
Buffett
Archive
Earnings
Trader
Talk
Tech
Cybersecurity
Enterprise
Internet
Media
Mobile
Social
Media
Venture
Capital
Tech
Guide
Politics
White
House
Policy
Defense
Congress
2020
Elections
CNBC
TV
Live
TV
Live
Audio
Latest
Video
Top
Video
CEO
Interviews
Business
Day
Shows
Primetime
Shows
CNBC
World
Digital
Originals
Full
Episodes
Menu
SEARCH
QUOTES
Markets
Pre-Markets
U.S.
Markets
Currencies
Cryptocurrency
Futures
&
Commodities
Bonds
Funds
&
ETFs
Watchlist
Business
Economy
Finance
Health
&
Science
Media
Real
Estate
Energy
Transportation
Industrials
Retail
Wealth


#### Remove all English stop words

In [16]:
# Import the corpus reader under an alias so that binding `stopwords` to the
# set below does not clobber it. Without the alias, re-running this cell
# fails with AttributeError because a set has no .words() method.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set for fast membership tests.
stopwords = set(nltk_stopwords.words('english'))

# Keep every token that is not a stop word, preserving order.
# NOTE(review): the membership test is case-sensitive. If the NLTK list is
# all lowercase (it appears to be), capitalized stop words such as 'The'
# survive. Lowercasing the token before the test would change the counts
# reported in the next cell -- confirm the intended behaviour.
filtered_token = [token for token in tokens if token not in stopwords]

In [17]:
# Token counts before and after stop-word removal, one per line.
print(len(tokens), len(filtered_token), sep="\n")

1082
848


#### List and count n-grams for any given input n

In [18]:
# list n-grams
from collections import Counter
from nltk import ngrams
# Read the n-gram size interactively (e.g. n = 3). int() raises ValueError
# on non-numeric input.
n = int(input("n = "))
# n-grams are only defined for n >= 1; fail fast with a clear message rather
# than passing a meaningless size on to ngrams().
if n < 1:
    raise ValueError("n must be a positive integer, got {}".format(n))

# Count how often each n-token window occurs in the article tokens.
n_grams = Counter(ngrams(tokens, n))
print(len(n_grams))  # number of distinct n-grams
print(n_grams)

n = 3
962
Counter({('Markets', 'Pre-Markets', 'U.S.'): 2, ('Pre-Markets', 'U.S.', 'Markets'): 2, ('U.S.', 'Markets', 'Currencies'): 2, ('Markets', 'Currencies', 'Cryptocurrency'): 2, ('Currencies', 'Cryptocurrency', 'Futures'): 2, ('Cryptocurrency', 'Futures', '&'): 2, ('Futures', '&', 'Commodities'): 2, ('&', 'Commodities', 'Bonds'): 2, ('Commodities', 'Bonds', 'Funds'): 2, ('Bonds', 'Funds', '&'): 2, ('Funds', '&', 'ETFs'): 2, ('&', 'ETFs', 'Watchlist'): 2, ('ETFs', 'Watchlist', 'Business'): 2, ('Watchlist', 'Business', 'Economy'): 2, ('Business', 'Economy', 'Finance'): 2, ('Economy', 'Finance', 'Health'): 2, ('Finance', 'Health', '&'): 2, ('Health', '&', 'Science'): 2, ('&', 'Science', 'Media'): 2, ('Science', 'Media', 'Real'): 2, ('Media', 'Real', 'Estate'): 2, ('Real', 'Estate', 'Energy'): 2, ('Estate', 'Energy', 'Transportation'): 2, ('Energy', 'Transportation', 'Industrials'): 2, ('Transportation', 'Industrials', 'Retail'): 2, ('Industrials', 'Retail', 'Wealth'): 2, ('Retail', '

#### Lemmatize and deduplicate unigrams into a vocabulary of terms.

In [19]:
#lemmatize unigrams

from nltk.stem import WordNetLemmatizer

# One lemmatizer instance, reused for every token.
wordnet_lemmatizer = WordNetLemmatizer()

# Lowercase each token, then lemmatize it, keeping the original token order.
# lemmatize() is called without a part-of-speech argument.
lem_unigrams = [wordnet_lemmatizer.lemmatize(word.lower()) for word in tokens]

In [23]:
# Frequency table of the lemmatized unigrams; its keys are the deduplicated
# vocabulary.
vocabulary_lem = Counter()
vocabulary_lem.update(lem_unigrams)
print(vocabulary_lem)

Counter({',': 37, '.': 31, 'a': 24, 'the': 24, 'it': 20, "'s": 16, 'to': 16, 'and': 14, 'in': 13, 'cnbc': 13, 'netflix': 13, 'of': 13, '``': 12, "''": 12, 'is': 11, 'that': 11, '&': 8, 'business': 8, 'video': 8, 'price': 8, 'market': 7, 'medium': 7, 'more': 7, 'trainer': 7, 'on': 7, 'for': 7, 'original': 6, 'disney': 6, 'streaming': 6, 'content': 6, 'up': 5, 'you': 5, 'tech': 5, 'tv': 5, 'service': 5, 'said': 5, 'percent': 5, 'subscriber': 5, 'data': 5, 'sign': 4, 'u.s.': 4, 'etf': 4, 'finance': 4, 'invest': 4, 'capital': 4, '2020': 4, 'live': 4, 'show': 4, "'": 4, '2019': 4, 'new': 4, 'construct': 4, 'anthony': 4, 'which': 4, 'he': 4, 'company': 4, 'at': 4, 'from': 4, 'they': 4, 'can': 4, 'with': 4, 'news': 4, 'watchlist': 3, 'make': 3, 'future': 3, 'personal': 3, 'financial': 3, 'trading': 3, 'street': 3, 'earnings': 3, 'policy': 3, 'ceo': 3, 'day': 3, 'digital': 3, 'key': 3, 'dilemma': 3, 'david': 3, 'say': 3, 'aegis': 3, 'increase': 3, 'bell': 3, 'have': 3, 'when': 3, 'this': 3, 'g

In [26]:
#apply stemming to lemmatized tokens
#not required in the questions, but required in assessment details by "Stem and lemmatize work tokens (20 pts)"
#just for reference
from nltk.stem.porter import PorterStemmer
# One Porter stemmer instance, reused for every lemma.
porter_stemmer = PorterStemmer()

# Stem each lemmatized unigram, preserving order.
stem_lem_unigrams = list(map(porter_stemmer.stem, lem_unigrams))

In [27]:
# Frequency table of the stemmed lemmas; its keys are the deduplicated
# vocabulary.
vocabulary_stem_lem = Counter()
vocabulary_stem_lem.update(stem_lem_unigrams)
print(vocabulary_stem_lem)

Counter({',': 37, '.': 31, 'a': 24, 'the': 24, 'it': 20, "'s": 16, 'to': 16, 'and': 14, 'in': 13, 'cnbc': 13, 'netflix': 13, 'of': 13, '``': 12, "''": 12, 'is': 11, 'that': 11, '&': 8, 'busi': 8, 'video': 8, 'price': 8, 'market': 7, 'medium': 7, 'invest': 7, 'more': 7, 'trainer': 7, 'on': 7, 'for': 7, 'origin': 6, 'disney': 6, 'stream': 6, 'content': 6, 'subscrib': 6, 'up': 5, 'you': 5, 'tech': 5, 'tv': 5, 'servic': 5, 'said': 5, 'percent': 5, 'data': 5, 'sign': 4, 'u.s.': 4, 'etf': 4, 'financ': 4, 'capit': 4, '2020': 4, 'live': 4, 'show': 4, "'": 4, '2019': 4, 'new': 4, 'construct': 4, 'anthoni': 4, 'increas': 4, 'which': 4, 'he': 4, 'compani': 4, 'at': 4, 'from': 4, 'they': 4, 'can': 4, 'with': 4, 'news': 4, 'watchlist': 3, 'make': 3, 'futur': 3, 'person': 3, 'financi': 3, 'trade': 3, 'street': 3, 'earn': 3, 'polici': 3, 'defens': 3, 'ceo': 3, 'day': 3, 'digit': 3, 'key': 3, 'dilemma': 3, 'david': 3, 'say': 3, 'aegi': 3, 'close': 3, 'bell': 3, 'have': 3, 'when': 3, 'thi': 3, 'got': 3

#### Print bigrams and trigrams in the first 5 sentences

In [28]:
# Rebuild the text of the first five sentences. Join with a space: an empty
# separator glues the end of one sentence to the start of the next
# ("...end.Next..."), which corrupts tokens at sentence boundaries.
first5sentences = ' '.join(sentences[:5])

In [29]:
from nltk import word_tokenize, ngrams
# Word-level tokens of the text built from the first five sentences.
tokens_5 = word_tokenize(first5sentences)

In [30]:
# Print every bigram (pair of adjacent tokens) from the first 5 sentences,
# one tuple per line.
bigrams_5 = ngrams(tokens_5, 2)
print("".join(str(pair) + "\n" for pair in bigrams_5), end="")

('×', 'LOG')
('LOG', 'IN')
('IN', 'SIGN')
('SIGN', 'UP')
('UP', 'Keep')
('Keep', 'Me')
('Me', 'Logged')
('Logged', 'In')
('In', 'Skip')
('Skip', 'Navigation')
('Navigation', 'SIGN')
('SIGN', 'IN')
('IN', 'Pro')
('Pro', 'Watchlist')
('Watchlist', 'Make')
('Make', 'It')
('It', 'Select')
('Select', 'USA')
('USA', 'INTL')
('INTL', 'Markets')
('Markets', 'Pre-Markets')
('Pre-Markets', 'U.S.')
('U.S.', 'Markets')
('Markets', 'Currencies')
('Currencies', 'Cryptocurrency')
('Cryptocurrency', 'Futures')
('Futures', '&')
('&', 'Commodities')
('Commodities', 'Bonds')
('Bonds', 'Funds')
('Funds', '&')
('&', 'ETFs')
('ETFs', 'Watchlist')
('Watchlist', 'Business')
('Business', 'Economy')
('Economy', 'Finance')
('Finance', 'Health')
('Health', '&')
('&', 'Science')
('Science', 'Media')
('Media', 'Real')
('Real', 'Estate')
('Estate', 'Energy')
('Energy', 'Transportation')
('Transportation', 'Industrials')
('Industrials', 'Retail')
('Retail', 'Wealth')
('Wealth', 'Small')
('Small', 'Business')
('Busine

In [31]:
# Print every trigram (three adjacent tokens) from the first 5 sentences,
# one tuple per line.
trigrams_5 = ngrams(tokens_5, 3)
print("".join(str(triple) + "\n" for triple in trigrams_5), end="")

('×', 'LOG', 'IN')
('LOG', 'IN', 'SIGN')
('IN', 'SIGN', 'UP')
('SIGN', 'UP', 'Keep')
('UP', 'Keep', 'Me')
('Keep', 'Me', 'Logged')
('Me', 'Logged', 'In')
('Logged', 'In', 'Skip')
('In', 'Skip', 'Navigation')
('Skip', 'Navigation', 'SIGN')
('Navigation', 'SIGN', 'IN')
('SIGN', 'IN', 'Pro')
('IN', 'Pro', 'Watchlist')
('Pro', 'Watchlist', 'Make')
('Watchlist', 'Make', 'It')
('Make', 'It', 'Select')
('It', 'Select', 'USA')
('Select', 'USA', 'INTL')
('USA', 'INTL', 'Markets')
('INTL', 'Markets', 'Pre-Markets')
('Markets', 'Pre-Markets', 'U.S.')
('Pre-Markets', 'U.S.', 'Markets')
('U.S.', 'Markets', 'Currencies')
('Markets', 'Currencies', 'Cryptocurrency')
('Currencies', 'Cryptocurrency', 'Futures')
('Cryptocurrency', 'Futures', '&')
('Futures', '&', 'Commodities')
('&', 'Commodities', 'Bonds')
('Commodities', 'Bonds', 'Funds')
('Bonds', 'Funds', '&')
('Funds', '&', 'ETFs')
('&', 'ETFs', 'Watchlist')
('ETFs', 'Watchlist', 'Business')
('Watchlist', 'Business', 'Economy')
('Business', 'Economy

#### Print POS tags in the first 5 sentences

In [32]:
from nltk import pos_tag
# Tag each token from the first five sentences with its part of speech.
# The result is a list of (token, tag) pairs, as the printed output shows.
sentence_pos = pos_tag(tokens_5)
print(sentence_pos)

[('×', 'JJ'), ('LOG', 'NNP'), ('IN', 'NNP'), ('SIGN', 'NNP'), ('UP', 'NNP'), ('Keep', 'NNP'), ('Me', 'NNP'), ('Logged', 'NNP'), ('In', 'IN'), ('Skip', 'NNP'), ('Navigation', 'NNP'), ('SIGN', 'NNP'), ('IN', 'NNP'), ('Pro', 'NNP'), ('Watchlist', 'NNP'), ('Make', 'NNP'), ('It', 'PRP'), ('Select', 'NNP'), ('USA', 'NNP'), ('INTL', 'NNP'), ('Markets', 'NNP'), ('Pre-Markets', 'NNP'), ('U.S.', 'NNP'), ('Markets', 'NNP'), ('Currencies', 'NNP'), ('Cryptocurrency', 'NNP'), ('Futures', 'NNP'), ('&', 'CC'), ('Commodities', 'NNP'), ('Bonds', 'NNP'), ('Funds', 'NNP'), ('&', 'CC'), ('ETFs', 'NNP'), ('Watchlist', 'NNP'), ('Business', 'NNP'), ('Economy', 'NNP'), ('Finance', 'NNP'), ('Health', 'NNP'), ('&', 'CC'), ('Science', 'NNP'), ('Media', 'NNP'), ('Real', 'NNP'), ('Estate', 'NNP'), ('Energy', 'NNP'), ('Transportation', 'NNP'), ('Industrials', 'NNP'), ('Retail', 'NNP'), ('Wealth', 'NNP'), ('Small', 'NNP'), ('Business', 'NNP'), ('Investing', 'NNP'), ('Invest', 'NNP'), ('In', 'IN'), ('You', 'PRP'), ('P