In [None]:
!pip install newsapi-python

In [None]:
import os
import pandas as pd
from newsapi import NewsApiClient
from dotenv import load_dotenv
# Load variables from a local .env file into the environment so os.getenv can read them
load_dotenv()

In [None]:
# NLTK setup: fetch every resource this notebook relies on, then build the VADER analyzer

import nltk

# vader_lexicon  -> SentimentIntensityAnalyzer
# stopwords      -> stopwords.words('english')
# wordnet/omw-1.4 -> WordNetLemmatizer
# punkt/punkt_tab -> word_tokenize (which of the two is used depends on the
#                    installed NLTK release; an unknown id is reported, not raised)
for resource in ("vader_lexicon", "stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Read the NewsAPI key from the "news_api" environment variable
api_key = os.getenv("news_api")

# Report only whether the key was found: printing the key itself would store
# the secret in the notebook's saved output.
print("news_api key loaded" if api_key else "news_api key not found - check the .env file")

In [None]:
# Create the NewsAPI client, authenticated with the key read from the environment
newsapi = NewsApiClient(api_key=api_key)

In [None]:
# Fetch top headlines (English, US) matching the query string.
# NOTE(review): "stock name" looks like a placeholder query - confirm the intended search term.
stock_x_headlines = newsapi.get_top_headlines(q="stock name", language="en", country="us")
stock_x_headlines 

In [None]:
# Print total articles
print(f"Total articles about stock x: {stock_x_headlines['totalResults']}")

# Show sample article (None when the query returned no articles).
# The response lives in `stock_x_headlines`; `stock_x` was never defined.
stock_x_headlines["articles"][0] if stock_x_headlines["articles"] else None

In [None]:
# Build a DataFrame with one row per article returned by the API
stock_x_df = pd.DataFrame(data=stock_x_headlines["articles"])

# Preview the first rows
stock_x_df.head(n=5)

In [None]:
# Create the stock_x sentiment scores DataFrame
stock_x_sentiments = []

for article in stock_x_headlines["articles"]:
    text = article["content"]
    published = article["publishedAt"]
    # Skip articles with missing content or timestamp explicitly instead of
    # relying on a swallowed AttributeError from the analyzer.
    if not text or not published:
        continue

    sentiment = analyzer.polarity_scores(text)
    stock_x_sentiments.append({
        "text": text,
        "date": published[:10],  # first 10 characters of the timestamp (the date part)
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "neutral": sentiment["neu"],
        "negative": sentiment["neg"],
    })

# Passing `columns` fixes the column order and still yields the expected
# (empty) columns when no article could be scored.
cols = ["date", "text", "compound", "positive", "neutral", "negative"]
stock_x_df = pd.DataFrame(stock_x_sentiments, columns=cols)
stock_x_df.head()

In [None]:
# Summary statistics of the stock_x sentiment DataFrame

stock_x_df.describe()

In [None]:
# Mean of the "positive" score column
stock_x_df["positive"].mean()

In [None]:
# Highest "compound" score in the DataFrame
stock_x_df["compound"].max()

In [None]:
# Lowest "compound" score in the DataFrame
stock_x_df["compound"].min()

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [None]:
# Lemmatizer shared by the text-processing helpers
lemmatizer = WordNetLemmatizer()

# Extra stopwords layered on top of NLTK's English list
sw_addon = {'char', 'reuters', 'ha', 'cryptocurrency', 'million', '…'}

# Final stopword set: the default English stopwords plus the add-ons
sw = set(stopwords.words('english')) | sw_addon



In [None]:
# Tokenizer used to build the per-article token lists
def tokenizer(text):
    """Turn raw text into lowercase, lemmatized, stopword-free tokens.

    Args:
        text: The string to tokenize. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: Tokens in their original order, with ASCII punctuation
        removed, lowercased, lemmatized, and with every entry of ``sw`` dropped.
    """
    if not text:
        return []

    # Delete every character listed in string.punctuation in a single pass
    cleaned = text.translate(str.maketrans("", "", punctuation))

    # Lowercase BEFORE lemmatizing: the WordNet lemmatizer matches lowercase
    # forms, so capitalized words would otherwise pass through unchanged
    words = [word.lower() for word in word_tokenize(cleaned)]
    lemmas = [lemmatizer.lemmatize(word) for word in words]

    # Filter after lemmatizing so lemmatized forms listed in `sw` are removed too
    return [lemma for lemma in lemmas if lemma not in sw]

In [None]:
# Create a new tokens column for stock_x
# One token list per row keeps the list aligned with the DataFrame; rows whose
# text is not a string get an empty list rather than being skipped, because a
# skipped row would make the column assignment fail on a length mismatch.
stock_x_tokens = [
    tokenizer(text) if isinstance(text, str) else []
    for text in stock_x_df['text']
]

stock_x_df['tokens'] = stock_x_tokens
stock_x_df.head()

In [None]:
from collections import Counter
from nltk import ngrams

In [None]:
def process_text(doc):
    """Clean a document and return its lowercase, lemmatized, stopword-free words.

    Args:
        doc: The text to process.

    Returns:
        list[str]: Words of ``doc`` after removing every character that is not
        an ASCII letter or a space, lowercasing, lemmatizing, and dropping
        NLTK's English stopwords.
    """
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    # Clean the function's own argument (previously read the unrelated global `article`)
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    # Lowercase before lemmatizing so capitalized words are lemmatized as well
    lem = [lemmatizer.lemmatize(word.lower()) for word in words]
    output = [word for word in lem if word not in sw]
    return output

In [None]:
# Count the most frequent words across a corpus
def word_counter(corpus):
    """Return the ten most frequent processed words in ``corpus``.

    The documents are joined with single spaces, passed through
    ``process_text`` and tallied; the result is a DataFrame with ``word``
    and ``count`` columns, most frequent first.
    """
    tokens = process_text(' '.join(corpus))
    most_frequent = Counter(tokens).most_common(10)
    return pd.DataFrame(most_frequent, columns=['word', 'count'])

In [None]:
# Count the bigrams (N=2) of every stock x article, one Counter per row
n = 2
text_column = stock_x_df['text']
bigram_counts_xc = [
    Counter(ngrams(word_tokenize(text_column[idx]), n))
    for idx in range(len(text_column))
]

In [None]:
# Generate the stock x N-grams where N=2
# YOUR CODE HERE!
def bigram_counter(corpus):
    """Return the ten most frequent bigrams in ``corpus``.

    The documents are joined with single spaces, passed through
    ``process_text`` and paired into bigrams; the result is a DataFrame with
    ``bigram`` and ``count`` columns, most frequent first.
    """
    tokens = process_text(' '.join(corpus))
    pair_counts = Counter(ngrams(tokens, n=2))
    return pd.DataFrame(pair_counts.most_common(10), columns=['bigram', 'count'])
# Per-article bigram counts built from the raw word tokens.
# Results are collected in `bigram_counts_stock_x`; the previous code appended
# to `bigram_counts_eth`, a name that is never defined in this notebook.
n = 2
bigram_counts_stock_x = [
    Counter(ngrams(word_tokenize(text), n))
    for text in stock_x_df['text']
]

In [None]:
# Named Entity Recognition

In [None]:
import spacy
from spacy import displacy

In [None]:
# Download the language model for SpaCy
!python -m spacy download en_core_web_sm

In [None]:
!pip install spacy

In [None]:
# Load the spaCy model (the 'en_core_web_sm' pipeline) into `nlp`
nlp = spacy.load('en_core_web_sm')

In [None]:
# Concatenate all of the stock x text together
# Join with a space so the last word of one article and the first word of the
# next are not fused into a single token; missing texts are skipped instead of
# being turned into the literal string "None".
stock_x_text = ' '.join(str(text) for text in stock_x_df['text'].dropna())

In [None]:
# Run the loaded spaCy pipeline over the concatenated text; the result is
# what the entity visualization below is rendered from
stock_x_ner = nlp(stock_x_text)

In [None]:
# Render the visualization of the processed text using displaCy's 'ent' style
displacy.render(stock_x_ner, style='ent')