In [2]:
# Initial imports
import os
import pandas as pd
from datetime import datetime, timedelta
from pathlib import Path
from newsapi.newsapi_client import NewsApiClient
from dotenv import load_dotenv
from nltk.corpus import stopwords, reuters
from nltk.tokenize import sent_tokenize, word_tokenize
load_dotenv()
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
api_key = os.getenv("news_api")

In [4]:
newsapi = NewsApiClient(api_key=api_key)

In [51]:
tech_news = newsapi.get_everything(q = "tesla & inflation", language="en", 
page_size=100, sort_by="relevancy")
print(tech_news["articles"][0])


{'source': {'id': None, 'name': 'New York Times'}, 'author': 'The New York Times', 'title': 'Here is where inflation stands as the Fed meets.', 'description': 'A jump in consumer prices is sure to come up at a news conference on Wednesday after the Federal Reserve’s meeting.', 'url': 'https://www.nytimes.com/live/2021/06/16/business/economy-stock-market-news', 'urlToImage': 'https://static01.nyt.com/images/2021/06/16/business/16economy-briefing-FedInflation/merlin_188586378_b7a0c048-2add-4403-b30c-357399ab4cea-facebookJumbo.jpg', 'publishedAt': '2021-06-16T10:21:32Z', 'content': 'A day after ousting two top executives, the electric truck start-up Lordstown Motors said on Tuesday that it was on track to start production in September even if it does not raise additional funding… [+3149 chars]'}


In [23]:
print(tech_news["totalResults"])

478


In [24]:
#print(f"Total articles about Tesla and Inflation: {inflation_news['totalResults']}")
#tech_news["articles"][0]

In [25]:
article_new = tech_news["articles"][0]

In [26]:
print(article_new)

{'source': {'id': None, 'name': 'New York Times'}, 'author': 'The New York Times', 'title': 'Here is where inflation stands as the Fed meets.', 'description': 'A jump in consumer prices is sure to come up at a news conference on Wednesday after the Federal Reserve’s meeting.', 'url': 'https://www.nytimes.com/live/2021/06/16/business/economy-stock-market-news', 'urlToImage': 'https://static01.nyt.com/images/2021/06/16/business/16economy-briefing-FedInflation/merlin_188586378_b7a0c048-2add-4403-b30c-357399ab4cea-facebookJumbo.jpg', 'publishedAt': '2021-06-16T10:21:32Z', 'content': 'A day after ousting two top executives, the electric truck start-up Lordstown Motors said on Tuesday that it was on track to start production in September even if it does not raise additional funding… [+3149 chars]'}


In [35]:
# In order to tokenize properly, all text needs to be strings, not integers or floats
article_letters_only = re.sub("[^a-zA-Z]", " ", str(article_new))
article_letters_only

'  source     id   None   name    New York Times     author    The New York Times    title    Here is where inflation stands as the Fed meets     description    A jump in consumer prices is sure to come up at a news conference on Wednesday after the Federal Reserve s meeting     url    https   www nytimes com live            business economy stock market news    urlToImage    https   static   nyt com images            business   economy briefing FedInflation merlin           b a c     add      b  c       ab cea facebookJumbo jpg    publishedAt              T        Z    content    A day after ousting two top executives  the electric truck start up Lordstown Motors said on Tuesday that it was on track to start production in September even if it does not raise additional funding         chars   '

In [36]:
# First, we tokenize sentences
sent_sample = sent_tokenize(article_letters_only)
print(sent_sample)

['  source     id   None   name    New York Times     author    The New York Times    title    Here is where inflation stands as the Fed meets     description    A jump in consumer prices is sure to come up at a news conference on Wednesday after the Federal Reserve s meeting     url    https   www nytimes com live            business economy stock market news    urlToImage    https   static   nyt com images            business   economy briefing FedInflation merlin           b a c     add      b  c       ab cea facebookJumbo jpg    publishedAt              T        Z    content    A day after ousting two top executives  the electric truck start up Lordstown Motors said on Tuesday that it was on track to start production in September even if it does not raise additional funding         chars']


In [37]:
# Second, we tokenize words within the sentences
word_sample = word_tokenize(article_letters_only)
print(word_sample)

['source', 'id', 'None', 'name', 'New', 'York', 'Times', 'author', 'The', 'New', 'York', 'Times', 'title', 'Here', 'is', 'where', 'inflation', 'stands', 'as', 'the', 'Fed', 'meets', 'description', 'A', 'jump', 'in', 'consumer', 'prices', 'is', 'sure', 'to', 'come', 'up', 'at', 'a', 'news', 'conference', 'on', 'Wednesday', 'after', 'the', 'Federal', 'Reserve', 's', 'meeting', 'url', 'https', 'www', 'nytimes', 'com', 'live', 'business', 'economy', 'stock', 'market', 'news', 'urlToImage', 'https', 'static', 'nyt', 'com', 'images', 'business', 'economy', 'briefing', 'FedInflation', 'merlin', 'b', 'a', 'c', 'add', 'b', 'c', 'ab', 'cea', 'facebookJumbo', 'jpg', 'publishedAt', 'T', 'Z', 'content', 'A', 'day', 'after', 'ousting', 'two', 'top', 'executives', 'the', 'electric', 'truck', 'start', 'up', 'Lordstown', 'Motors', 'said', 'on', 'Tuesday', 'that', 'it', 'was', 'on', 'track', 'to', 'start', 'production', 'in', 'September', 'even', 'if', 'it', 'does', 'not', 'raise', 'additional', 'fundin

In [38]:
# Third, we get rid of stopwords that are not useful for proper tokenization. Here is a list.
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [39]:
# Fourth, we apply the stopwords function using the above stopwords list
sw_list = set(stopwords.words('english'))
sw_result = [word.lower() for word in word_sample if word.lower() not in sw_list]
print(sw_result)


['source', 'id', 'none', 'name', 'new', 'york', 'times', 'author', 'new', 'york', 'times', 'title', 'inflation', 'stands', 'fed', 'meets', 'description', 'jump', 'consumer', 'prices', 'sure', 'come', 'news', 'conference', 'wednesday', 'federal', 'reserve', 'meeting', 'url', 'https', 'www', 'nytimes', 'com', 'live', 'business', 'economy', 'stock', 'market', 'news', 'urltoimage', 'https', 'static', 'nyt', 'com', 'images', 'business', 'economy', 'briefing', 'fedinflation', 'merlin', 'b', 'c', 'add', 'b', 'c', 'ab', 'cea', 'facebookjumbo', 'jpg', 'publishedat', 'z', 'content', 'day', 'ousting', 'two', 'top', 'executives', 'electric', 'truck', 'start', 'lordstown', 'motors', 'said', 'tuesday', 'track', 'start', 'production', 'september', 'even', 'raise', 'additional', 'funding', 'chars']


In [44]:
# Adding customized stopwords taken from the first result
sw_custom = {'https', 'z', 'e', 'url', 'www', 'b', 'c'}
custom_result = [word.lower() for word in word_sample if word.lower() not in sw_list.union(sw_custom)]

In [45]:
print(custom_result)

['source', 'id', 'none', 'name', 'new', 'york', 'times', 'author', 'new', 'york', 'times', 'title', 'inflation', 'stands', 'fed', 'meets', 'description', 'jump', 'consumer', 'prices', 'sure', 'come', 'news', 'conference', 'wednesday', 'federal', 'reserve', 'meeting', 'nytimes', 'com', 'live', 'business', 'economy', 'stock', 'market', 'news', 'urltoimage', 'static', 'nyt', 'com', 'images', 'business', 'economy', 'briefing', 'fedinflation', 'merlin', 'add', 'ab', 'cea', 'facebookjumbo', 'jpg', 'publishedat', 'content', 'day', 'ousting', 'two', 'top', 'executives', 'electric', 'truck', 'start', 'lordstown', 'motors', 'said', 'tuesday', 'track', 'start', 'production', 'september', 'even', 'raise', 'additional', 'funding', 'chars']


In [47]:
# Activate the lemmatizer
lemmatizer = WordNetLemmatizer()

In [48]:
next_result = [lemmatizer.lemmatize(word) for word in word_sample]
print(next_result)

['source', 'id', 'None', 'name', 'New', 'York', 'Times', 'author', 'The', 'New', 'York', 'Times', 'title', 'Here', 'is', 'where', 'inflation', 'stand', 'a', 'the', 'Fed', 'meet', 'description', 'A', 'jump', 'in', 'consumer', 'price', 'is', 'sure', 'to', 'come', 'up', 'at', 'a', 'news', 'conference', 'on', 'Wednesday', 'after', 'the', 'Federal', 'Reserve', 's', 'meeting', 'url', 'http', 'www', 'nytimes', 'com', 'live', 'business', 'economy', 'stock', 'market', 'news', 'urlToImage', 'http', 'static', 'nyt', 'com', 'image', 'business', 'economy', 'briefing', 'FedInflation', 'merlin', 'b', 'a', 'c', 'add', 'b', 'c', 'ab', 'cea', 'facebookJumbo', 'jpg', 'publishedAt', 'T', 'Z', 'content', 'A', 'day', 'after', 'ousting', 'two', 'top', 'executive', 'the', 'electric', 'truck', 'start', 'up', 'Lordstown', 'Motors', 'said', 'on', 'Tuesday', 'that', 'it', 'wa', 'on', 'track', 'to', 'start', 'production', 'in', 'September', 'even', 'if', 'it', 'doe', 'not', 'raise', 'additional', 'funding', 'char'

stopped here. Need to go back and push to github as proof that we did homework for 1st section of NLP