In [90]:
# Initial imports
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from newsapi.newsapi_client import NewsApiClient
from dotenv import load_dotenv
from nltk.corpus import stopwords, reuters
from nltk.tokenize import sent_tokenize, word_tokenize
load_dotenv()
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [91]:
api_key = os.getenv("news_api")

In [92]:
newsapi = NewsApiClient(api_key=api_key)

In [93]:
tech_news = newsapi.get_everything(q = "tesla & inflation", language="en", 
page_size=100, sort_by="relevancy")
print(tech_news["articles"][0])


{'source': {'id': None, 'name': 'New York Times'}, 'author': 'The New York Times', 'title': 'Here is where inflation stands as the Fed meets.', 'description': 'A jump in consumer prices is sure to come up at a news conference on Wednesday after the Federal Reserve’s meeting.', 'url': 'https://www.nytimes.com/live/2021/06/16/business/economy-stock-market-news', 'urlToImage': 'https://static01.nyt.com/images/2021/06/16/business/16economy-briefing-FedInflation/merlin_188586378_b7a0c048-2add-4403-b30c-357399ab4cea-facebookJumbo.jpg', 'publishedAt': '2021-06-16T10:21:32Z', 'content': 'A day after ousting two top executives, the electric truck start-up Lordstown Motors said on Tuesday that it was on track to start production in September even if it does not raise additional funding… [+3149 chars]'}


In [94]:
print(tech_news["totalResults"])

479


In [95]:
#print(f"Total articles about Tesla and Inflation: {inflation_news['totalResults']}")
#tech_news["articles"][0]

In [96]:
article_new = tech_news["articles"]

In [97]:
# In order to tokenize properly, all text needs to be strings, not integers or floats
article_letters_only = re.sub("[^a-zA-Z]", " ", str(article_new))


In [98]:
# First, we tokenize sentences
sent_sample = sent_tokenize(article_letters_only)

In [99]:
# Second, we tokenize words within the sentences
word_sample = word_tokenize(article_letters_only)


In [100]:
# Third, we get rid of stopwords that are not useful for proper tokenization. Here is a list.
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [101]:
# Fourth, we apply the stopwords function using the above stopwords list
sw_list = set(stopwords.words('english'))
sw_result = [word.lower() for word in word_sample if word.lower() not in sw_list]



In [102]:
# Adding customized stopwords taken from the first result
sw_custom = {'https', 'com', 'http', 'z', 'e', 'url', 'www', 'b', 'c', 'char'}
custom_result = [word.lower() for word in word_sample if word.lower() not in sw_list.union(sw_custom)]


In [103]:
# Activate the lemmatizer
lemmatizer = WordNetLemmatizer()

In [104]:
next_result = [lemmatizer.lemmatize(word) for word in word_sample]
# print(next_result)

stopped here. Need to go back and push to github as proof that we did homework for 1st section of NLP

In [122]:
# Calculating TF-IDF for the working corpus.
vectorizer = TfidfVectorizer(stop_words="english")
X_corpus = vectorizer.fit_transform(next_result)


In [125]:
# Getting matrix info
print(f"Matrix shape: {X_corpus.shape}")
print(f"Total number of documents: {X_corpus.shape[0]}")
print(f"Total number of unique words (tokens): {X_corpus.shape[1]}")


Matrix shape: (12944, 1798)
Total number of documents: 12944
Total number of unique words (tokens): 1798


In [126]:
# Retrieve words list from corpus
words_corpus = vectorizer.get_feature_names()
# print(words_corpus)

In [127]:
# Getting the TF-IDF weight of each word in corpus as DataFrame
words_corpus_df = pd.DataFrame(
    list(zip(words_corpus, np.ravel(X_corpus.mean(axis=0)))), columns=["Word", "TF-IDF"]
)

words_corpus_df = words_corpus_df.sort_values(by=["TF-IDF"], ascending=False)

In [129]:
words_corpus_df.head(20)

Unnamed: 0,Word,TF-IDF
1317,reuters,0.018541
293,com,0.017383
728,http,0.016224
1507,stock,0.012129
1766,www,0.008962
949,market,0.008884
314,content,0.008498
100,author,0.007957
1230,publishedat,0.007726
256,char,0.007726


In [130]:
# Lowest 20 TF-IDF scores
words_corpus_df.tail(20)

Unnamed: 0,Word,TF-IDF
1138,pddcmqvppsi,7.7e-05
1139,pdoucamophyph,7.7e-05
525,fame,7.7e-05
1144,perf,7.7e-05
521,fallacy,7.7e-05
1146,performing,7.7e-05
1149,person,7.7e-05
1165,pirekaefrn,7.7e-05
517,faf,7.7e-05
1151,phillip,7.7e-05


In [131]:
# Creating a DataFrame Representation of the TF-IDF results
next_result_df = pd.DataFrame(
    list(zip(vectorizer.get_feature_names(), np.ravel(X.sum(axis=0)))),
    columns=["Word", "Frequency"],
)

# Order the DataFrame by word frequency in descending order
next_result_df = next_result_df.sort_values(by=["Frequency"], ascending=False)

# Print the top 10 words
next_result_df.head(10)


Unnamed: 0,Word,Frequency
1317,reuters,240.0
293,com,225.0
728,http,210.0
1507,stock,157.0
1766,www,116.0
949,market,115.0
314,content,110.0
100,author,103.0
1230,publishedat,100.0
256,char,100.0


In [132]:
# Top words will be those with a frequency between 10 ans 30 (thumb rule)
top_words = next_result_df[
    (next_result_df["Frequency"] >= 10) & (next_result_df["Frequency"] <= 30)
]

top_words.head(10)

Unnamed: 0,Word,Frequency
408,dow,30.0
891,li,30.0
31,ahead,29.0
355,day,29.0
549,filter,28.0
436,east,28.0
696,higher,27.0
1032,net,27.0
659,ha,27.0
1519,street,26.0


In [133]:
# Top words will be those with a frequency between 10 ans 30 (thumb rule)
top_words = next_result_df[
    (next_result_df["Frequency"] >= 10) & (next_result_df["Frequency"] <= 30)
]

top_words.head(10)


Unnamed: 0,Word,Frequency
408,dow,30.0
891,li,30.0
31,ahead,29.0
355,day,29.0
549,filter,28.0
436,east,28.0
696,higher,27.0
1032,net,27.0
659,ha,27.0
1519,street,26.0
