In [3]:
import os
import re
from datetime import datetime
from functools import lru_cache
from urllib.error import HTTPError
from urllib.parse import urlparse
from urllib.request import urlopen, Request

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer


In [12]:
@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them.

    preprocess_text is applied to every row of the news DataFrame, so loading
    the stop-word corpus on each call would repeat identical work.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a piece of text before it is scored.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase, tokenize with ``nltk.word_tokenize``, drop
    English stop words, lemmatize each remaining token, and join the tokens
    with single spaces.

    Args:
        text: The text to clean. Values that are not ``str`` are returned
            unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Guard first so non-string values never trigger loading of NLTK data.
    if not isinstance(text, str):
        return text

    stop_words, lemmatizer = _text_resources()

    # r'[^\w\s]' matches any character that is not alphanumeric/underscore
    # and not whitespace, i.e. punctuation and special characters.
    text = re.sub(r'[^\w\s]', '', text).lower()
    tokens = nltk.word_tokenize(text)
    # Filter stop words first, then lemmatize what is left.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [5]:
# Scrape the finviz quote page of each ticker and keep its news table.
web_url = 'https://finviz.com/quote.ashx?t='
tickers = ['META', 'AMZN', 'AAPL', 'NFLX', 'GOOGL']

# Maps ticker symbol -> the parsed element with id="news-table" from that
# ticker's quote page. Tickers whose page could not be fetched or parsed are
# left out.
news_tables = {}

for tick in tickers:
    url = web_url + tick
    req = Request(url=url, headers={"User-Agent": "Chrome"})
    # Handle failures per ticker so one bad request does not drop the
    # remaining tickers.
    try:
        # The context manager closes the HTTP response once it is parsed.
        with urlopen(req) as response:
            html = BeautifulSoup(response, "html.parser")
    except HTTPError as e:
        print(str(e) + " " + url)
        continue
    except OSError as e:
        # URLError (connection/DNS failures) and socket timeouts are OSError
        # subclasses; report them instead of swallowing them silently.
        print(f"Request failed for {url}: {e}")
        continue

    news_table = html.find(id='news-table')
    if news_table is None:
        # find() returns None when no element matches; storing None would
        # break the cells that iterate over news_tables.
        print(f"No news table found for {tick} at {url}")
        continue
    news_tables[tick] = news_table

In [6]:

# Sanity check: print the size of each scraped table followed by its key.
# NOTE: len() on a BeautifulSoup element counts its direct child nodes, which
# is not necessarily the number of <tr> rows.
for file_name, news_table in news_tables.items():
    print('len of news_table',len(news_table))
    print(file_name)

len of news_table 201
META
len of news_table 201
AMZN
len of news_table 201
AAPL
len of news_table 201
NFLX
len of news_table 201
GOOGL


In [None]:
# Preview the first four rows of the AMZN news table: the link text of each
# row followed by the text of its first cell.
amazon = news_tables['AMZN']
amazon_tr = amazon.find_all('tr')

for table_row in amazon_tr[:4]:
    # `.a` / `.td` are None when a row has no such child; skip those rows
    # instead of failing with AttributeError.
    if table_row.a is None or table_row.td is None:
        continue
    a_text = table_row.a.text
    td_text = table_row.td.text
    print(a_text)
    print(td_text)

In [None]:
#original code
# Collect one [ticker, date, time, headline] row per news entry.
news_list = []

for file_name, news_table in news_tables.items():
    # The key is split on '_' and its first part is used as the ticker.
    tick = file_name.split('_')[0]
    # Rows may carry only a time, in which case the most recent date applies.
    # Reset per table so a date is never inherited from the previous ticker.
    date = None

    for i in news_table.find_all('tr'):
        # Skip rows that have no link or no cell instead of crashing on None.
        if i.a is None or i.td is None:
            continue

        text = i.a.get_text()
        date_scrape = i.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]
        elif len(date_scrape) >= 2:
            date = date_scrape[0]
            time = date_scrape[1]
        else:
            # Empty timestamp cell: nothing usable in this row.
            continue

        news_list.append([tick, date, time, text])

In [26]:
# My version
# Same scrape as the original loop, but dates are normalised and, for links
# hosted on finance.yahoo.com, the article paragraphs are appended to the
# headline text.


def _normalise_finviz_date(raw_date):
    """Convert a date token from the news table into the stored value.

    'Today' becomes today's ``datetime.date``; any other token is parsed with
    the ``%b-%d-%y`` format and returned as a 'YYYY-MM-DD' string.

    NOTE(review): the two branches return different types (date object vs.
    string). This is kept unchanged because later cells consume the column;
    confirm whether a single type is wanted.
    """
    if raw_date == 'Today':
        return datetime.now().date()
    return datetime.strptime(raw_date, "%b-%d-%y").strftime("%Y-%m-%d")


def _fetch_yahoo_article_paragraphs(url):
    """Return the paragraph texts inside the page's 'caas-body' div.

    Returns an empty list when the page has no such div, and None when the
    response status is not 200 (a message is printed in that case).
    Network errors propagate to the caller.
    """
    req = Request(url=url, headers={"User-Agent": "Chrome"})
    # The context manager closes the HTTP response after parsing.
    with urlopen(req) as response:
        status = response.getcode()
        if status != 200:
            # urlopen's response object has no `status_code` attribute, so
            # the status is read through getcode().
            print(f"Webscrapping failed: Unable to web scrape {url}",
                  status)
            return None
        html = BeautifulSoup(response, "html.parser")

    caas_body_div = html.find('div', class_='caas-body')
    if caas_body_div is None:
        return []
    return [p.text for p in caas_body_div.find_all('p')]


news_list = []
count = 0
for file_name, news_table in news_tables.items():
    tick = file_name.split('_')[0]
    # Rows may carry only a time; reset the date per table so it is never
    # inherited from the previous ticker.
    date = None

    for i in news_table.find_all('tr'):
        print("count tr")
        print(count)
        count += 1

        # Skip rows that have no link or no cell instead of crashing on None.
        if i.a is None or i.td is None:
            continue

        text = i.a.get_text()

        date_scrape = i.td.text.split()
        print(date_scrape)

        if len(date_scrape) == 1:
            time = date_scrape[0]
        elif len(date_scrape) >= 2:
            date = _normalise_finviz_date(date_scrape[0])
            time = date_scrape[1]
        else:
            # Empty timestamp cell: nothing usable in this row.
            continue

        try:
            url = i.a['href']
            if urlparse(url).netloc == 'finance.yahoo.com':
                paragraphs = _fetch_yahoo_article_paragraphs(url)
                if paragraphs is not None:
                    print("title : ", i.a.get_text())
                    text += ''.join('\n' + paragraph for paragraph in paragraphs)
        except Exception as e:
            # Best effort: a failed article fetch still keeps the headline.
            print(f"An error occurred: {str(e)}")

        news_list.append([tick, date, time, text])


count tr
0
['Today', '10:20AM']
title :  Amazon leaked documents show an exciting change is on the way
count tr
1
['09:53AM']
title :  2 Stock-Split Artificial Intelligence (AI) Stocks to Buy Hand-Over Fist in October
count tr
2
['09:15AM']
title :  Hollywood strikes have cost the US economy 45,000 jobs since May
count tr
3
['09:00AM']
title :  Is Trending Stock Amazon.com, Inc. (AMZN) a Buy Now?
count tr
4
['08:15AM']
title :  Shift4 and Amazon Team Up for Checkout-Free Shopping: What Investors Need to Know
count tr
5
['08:00AM']
title :  New This Holiday Season: Discounts on Shipping Packages
count tr
6
['08:00AM']
title :  3 Top Cloud Stocks to Buy in October
count tr
7
['07:10AM']
title :  Etsy Still Manages to Outshine Amazon in This Key Area
count tr
8
['06:05AM']
title :  Amazon Just Found a Smart Way to Add Billions of Dollars in High-Margin Revenue -- but Prime Members Won't Like It One Bit
count tr
9
['05:50AM']
title :  Microsoft Invested in OpenAI. Amazon and Google Investe

KeyboardInterrupt: 

In [27]:
# Display the scraped rows; each entry is a [ticker, date, time, text] list.
news_list

[['AMZN',
  datetime.date(2023, 10, 6),
  '10:20AM',
  "Amazon leaked documents show an exciting change is on the way\nWith the winter slowly approaching for many, there's some positive news about Amazon's grocery delivery service that comes right on time for those cold days. Amazon is lowering its free grocery delivery on Amazon Fresh for Prime members. Customers need to spend at least $100 to get the free delivery service, down from $150 which start in February 2023, according to a report by Insider."],
 ['AMZN',
  datetime.date(2023, 10, 6),
  '09:53AM',
  "2 Stock-Split Artificial Intelligence (AI) Stocks to Buy Hand-Over Fist in October\nInvestor psychology is hugely important in the stock market. Over the last few years, Nvidia (NASDAQ: NVDA) and Amazon (NASDAQ: AMZN) have used stock splits multiple times to manage their impressive share price growth. Let's discuss why they could continue to reward investors as they pivot to artificial intelligence (AI) technology."],
 ['AMZN',
 

In [18]:
from datetime import datetime
def parse_today_as_date(value):
    """Replace the literal 'Today' with the current date.

    Args:
        value: A date value as scraped; only the exact string 'Today' is
            treated specially.

    Returns:
        ``datetime.now().date()`` when ``value == 'Today'``, otherwise
        ``value`` unchanged.
    """
    return datetime.now().date() if value == 'Today' else value

In [28]:
# Build the news DataFrame and attach VADER sentiment scores to every row.
columns = ['ticker', 'date', 'time', 'text']
vader = SentimentIntensityAnalyzer()

news_df = pd.DataFrame(news_list, columns=columns)

# One polarity_scores dict per row, computed on the preprocessed text.
cleaned_text = news_df['text'].apply(preprocess_text)
scores = [vader.polarity_scores(article) for article in cleaned_text]
scores_df = pd.DataFrame(scores)

# Index-aligned join puts the score columns next to the scraped columns.
news_df = news_df.join(scores_df)
news_df

Unnamed: 0,ticker,date,time,text,neg,neu,pos,compound
0,AMZN,2023-10-06,10:20AM,Amazon leaked documents show an exciting chang...,0.067,0.58,0.353,0.9451
1,AMZN,2023-10-06,09:53AM,2 Stock-Split Artificial Intelligence (AI) Sto...,0.0,0.623,0.377,0.9628
2,AMZN,2023-10-06,09:15AM,"Hollywood strikes have cost the US economy 45,...",0.089,0.779,0.131,0.9644
3,AMZN,2023-10-06,09:00AM,"Is Trending Stock Amazon.com, Inc. (AMZN) a Bu...",0.017,0.757,0.225,0.9987
4,AMZN,2023-10-06,08:15AM,Shift4 and Amazon Team Up for Checkout-Free Sh...,0.0,0.928,0.072,0.34
5,AMZN,2023-10-06,08:00AM,New This Holiday Season: Discounts on Shipping...,0.056,0.674,0.27,0.9412
6,AMZN,2023-10-06,08:00AM,3 Top Cloud Stocks to Buy in October\nThe clou...,0.037,0.809,0.154,0.5859
7,AMZN,2023-10-06,07:10AM,Etsy Still Manages to Outshine Amazon in This ...,0.0,0.634,0.366,0.9423
8,AMZN,2023-10-06,06:05AM,Amazon Just Found a Smart Way to Add Billions ...,0.101,0.724,0.175,0.3979
9,AMZN,2023-10-06,05:50AM,Microsoft Invested in OpenAI. Amazon and Googl...,0.0,0.92,0.08,0.34


In [23]:
# parse_date_columns comes from the project-local database_handler module,
# whose source is not part of this notebook.
# NOTE(review): the return value is discarded, so this presumably modifies
# news_df in place (the 'time' column shows full timestamps in the next
# display) — confirm against database_handler.
from database_handler import parse_date_columns
parse_date_columns(news_df)

With the winter slowly approaching for many, there's some positive news about Amazon's grocery delivery service that comes right on time for those cold days. Amazon is lowering its free grocery delivery on Amazon Fresh for Prime members. Customers need to spend at least $100 to get the free delivery service, down from $150 which start in February 2023, according to a report by Insider.


In [24]:
# Display news_df again after parse_date_columns(news_df) was called on it.
news_df

Unnamed: 0,ticker,date,time,text,neg,neu,pos,compound
0,AMZN,2023-10-06,2023-10-06 10:20:00,Amazon leaked documents show an exciting chang...,0.067,0.58,0.353,0.9451
1,AMZN,2023-10-06,2023-10-06 09:53:00,2 Stock-Split Artificial Intelligence (AI) Sto...,0.0,0.623,0.377,0.9628
2,AMZN,2023-10-06,2023-10-06 09:15:00,"Hollywood strikes have cost the US economy 45,...",0.089,0.779,0.131,0.9644
3,AMZN,2023-10-06,2023-10-06 09:00:00,"Is Trending Stock Amazon.com, Inc. (AMZN) a Bu...",0.017,0.757,0.225,0.9987
4,AMZN,2023-10-06,2023-10-06 08:15:00,Shift4 and Amazon Team Up for Checkout-Free Sh...,0.0,0.928,0.072,0.34
5,AMZN,2023-10-06,2023-10-06 08:00:00,New This Holiday Season: Discounts on Shipping...,0.056,0.674,0.27,0.9412
6,AMZN,2023-10-06,2023-10-06 08:00:00,3 Top Cloud Stocks to Buy in October\nThe clou...,0.037,0.809,0.154,0.5859
7,AMZN,2023-10-06,2023-10-06 07:10:00,Etsy Still Manages to Outshine Amazon in This ...,0.0,0.634,0.366,0.9423
8,AMZN,2023-10-06,2023-10-06 06:05:00,Amazon Just Found a Smart Way to Add Billions ...,0.101,0.724,0.175,0.3979
9,AMZN,2023-10-06,2023-10-06 05:50:00,Microsoft Invested in OpenAI. Amazon and Googl...,0.0,0.92,0.08,0.34


In [None]:
# Probe one Yahoo Finance article URL: print the HTTP status and, on 200,
# the parsed page.
try:
    url = 'https://finance.yahoo.com/m/faab6fb7-c85c-32c9-a7bb-8ac838ea013b/3-little-known-ways-to-save.html'
    req = Request(url=url, headers={"User-Agent": "Chrome"})
    # The context manager closes the HTTP response when the block exits.
    with urlopen(req) as response:
        print(response.getcode())
        if response.getcode() == 200:
            html = BeautifulSoup(response, "html.parser")
            print(html)
except Exception as e:
    # Best-effort probe: report the failure instead of stopping the notebook.
    print(e)

In [None]:
# Count the <tr> rows of the table that `news_table` currently refers to.
# NOTE(review): `news_table` is not assigned in this cell; it is whatever an
# earlier loop or cell left bound — confirm which table is intended.
len(news_table.findAll('tr'))

In [None]:
# Score one piece of text with VADER after running it through preprocess_text.
# NOTE(review): `text` is not assigned in this cell; it is whatever value an
# earlier cell or loop left bound — confirm the intended input.
vader = SentimentIntensityAnalyzer()

processed_text = preprocess_text(text)
vader.polarity_scores(processed_text)

In [None]:
# Fetch the page linked from the second row of the AMZN news table and print
# all of its text.
news_table = news_tables['AMZN']

i = news_table.find_all('tr')[1]

text = i.a.get_text()
url = i.a['href']

req = Request(url=url, headers={"User-Agent": "Chrome"})
# The context manager closes the HTTP response once the page is parsed.
with urlopen(req) as response:
    html = BeautifulSoup(response, "html.parser")

# get_text() on the whole document returns the text of the entire page, not
# only the article body.
text_content = html.get_text()
print(text_content)

In [None]:
# Score the full page text held in `text_content` with VADER. Unlike the
# other scoring cells in this notebook, preprocess_text is not applied here.
vader = SentimentIntensityAnalyzer()

vader.polarity_scores(text_content)

In [None]:
# Extract the paragraphs of the 'caas-body' div from the parsed page in
# `html`, one paragraph per line.
caas_body_div = html.find('div', class_='caas-body')

text = ""
if caas_body_div is None:
    # find() returns None when the page has no such div; report it instead
    # of failing with AttributeError.
    print("No 'caas-body' div found in the parsed page")
else:
    text = ''.join(p.text + '\n' for p in caas_body_div.find_all('p'))
    print(text)

In [None]:
# Display the current contents of news_list ([ticker, date, time, text] rows).
news_list

In [18]:
from datetime import datetime
def parse_today_as_date(value):
    """Map the marker string 'Today' to the current date.

    This repeats a definition of the same function that appears earlier in
    the notebook.

    Args:
        value: Scraped date value.

    Returns:
        Today's ``datetime.date`` if ``value`` equals 'Today'; otherwise the
        input is handed back untouched.
    """
    if value == 'Today':
        return datetime.now().date()
    return value

In [30]:
# Rebuild the news DataFrame with VADER scores, then replace 'Today' in the
# date column with the current date.
columns = ['ticker', 'date', 'time', 'text']
vader = SentimentIntensityAnalyzer()

news_df = pd.DataFrame(news_list, columns=columns)

# One polarity_scores dict per row, computed on the preprocessed text.
scores = [
    vader.polarity_scores(cleaned)
    for cleaned in news_df['text'].apply(preprocess_text)
]
scores_df = pd.DataFrame(scores)

news_df = news_df.join(scores_df).assign(
    date=lambda frame: frame['date'].apply(parse_today_as_date)
)


In [31]:
# Display the scored DataFrame (scraped columns plus the VADER score columns).
news_df

Unnamed: 0,ticker,date,time,text,neg,neu,pos,compound
0,AMZN,Oct-05-23,07:52PM,3 Little-Known Ways to Save Money This October...,0.058,0.628,0.314,0.8750
1,AMZN,Oct-05-23,06:38PM,Amazons SpaceX Duel Heats Up as Tardy Satellit...,0.050,0.826,0.124,0.9899
2,AMZN,Oct-05-23,06:00PM,Amazon's Prime Big Deal Days Start on Oct. 10....,0.000,0.718,0.282,0.8957
3,AMZN,Oct-05-23,05:30PM,"Market Today: Mixed Results for Major Indices,...",0.082,0.798,0.121,0.9017
4,AMZN,Oct-05-23,05:01PM,Amazon is offering free grocery delivery on or...,0.107,0.763,0.130,0.7096
...,...,...,...,...,...,...,...,...
95,AMZN,Oct-02-23,01:02PM,Amazon.com Inc (AMZN): A Deep Dive into Financ...,0.032,0.704,0.264,0.9988
96,AMZN,Oct-02-23,11:57AM,Amazon Is Bringing Ads to Prime Video. Its a B...,0.068,0.685,0.247,0.6369
97,AMZN,Oct-02-23,11:34AM,Amazon (AMZN) Zoox is Set to Establish a Facil...,0.006,0.781,0.213,0.9968
98,AMZN,Oct-02-23,11:18AM,Target's answer to Amazon Prime Days is here ...,0.000,0.811,0.189,0.3612


In [None]:
import yfinance as yf

# Download AMZN price history through yfinance for the hard-coded window.
amazon_data = yf.download("AMZN", start="2022-01-01", end="2023-10-10")
# A bare last expression renders the DataFrame with the notebook's rich table
# display instead of the plain-text print() output.
amazon_data
