In [22]:
import os
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [23]:
def preprocess_text(text):
    # Preprocessing function
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    if isinstance(text, str):
        # remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)  # r'[^\w\s]' : matches any character that is not a word character (alphanumeric or underscore) or a whitespace character
        # convert to lowercase
        text = text.lower()
        # tokenize text
        tokens = nltk.word_tokenize(text)
        # remove stop words
        tokens = [token for token in tokens if token not in stop_words]
        # lemmatize text
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # join tokens back into text
        text = ' '.join(tokens)
    return text

In [3]:
web_url = 'https://finviz.com/quote.ashx?t='

news_tables = {}
# tickers = ['AMZN', 'GOOG', 'TSLA']
tickers = ['AMZN']

for tick in tickers:
    url = web_url + tick
    req = Request(url=url,headers={"User-Agent": "Chrome"}) 
    response = urlopen(req)    
    html = BeautifulSoup(response,"html.parser")
    news_table = html.find(id='news-table')
    news_tables[tick] = news_table

In [None]:
news_tables['AMZN']

In [None]:
amazon = news_tables['AMZN']
amazon_tr = amazon.findAll('tr')

for x, table_row in enumerate(amazon_tr):
    a_text = table_row.a.text
    td_text = table_row.td.text
    print(a_text)
    print(td_text)
    if x == 3:
        break

In [None]:
#original code
news_list = []

for file_name, news_table in news_tables.items():
    for i in news_table.findAll('tr'):
        
        text = i.a.get_text() 
        
        date_scrape = i.td.text.split()
        
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        tick = file_name.split('_')[0]
        
        news_list.append([tick, date, time, text])

In [14]:
#My version

news_list = []
count = 0
for file_name, news_table in news_tables.items():
    for i in news_table.findAll('tr'):
        print("count tr")
        print(count)
        count+=1
        text = i.a.get_text()
         
        date_scrape = i.td.text.split()
        
        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        tick = file_name.split('_')[0]
        try:
            url = i.a['href']
            parsed_url = urlparse(url)
            if parsed_url.netloc=='finance.yahoo.com':
                req = Request(url=url,headers={"User-Agent": "Chrome"}) 
                response = urlopen(req)
                if response.getcode() == 200:
                    html = BeautifulSoup(response,"html.parser")
                    print("title : ",i.a.get_text())

                    caas_body_div = html.find('div', class_='caas-body')

                    if caas_body_div: 
                        for p in caas_body_div.findAll('p'):
                            text += '\n' + p.text
                    # print("Content")
                    # print(text)
                    
                else:
                    print(f"Webscrapping failed: Unable to web scrape {url}",
                        response.status_code)
        except Exception as e:
            print(f"An error occurred: {str(e)}")
        finally:
            news_list.append([tick, date, time, text])



count tr
0
title :  3 Little-Known Ways to Save Money This October
count tr
1
title :  Amazons SpaceX Duel Heats Up as Tardy Satellites Set to Fly
count tr
2
title :  Amazon's Prime Big Deal Days Start on Oct. 10. Here Are 4 Tips to Score the Best Deals
count tr
3
title :  Market Today: Mixed Results for Major Indices, Aehr Test Systems Beats Q1 Estimates, Beverage ...
count tr
4
title :  Amazon is offering free grocery delivery on orders exceeding $100
count tr
5
title :  Amazons largest cargo jet makes debut
count tr
6
title :  Growth in AI, the cloud means tech stocks are 'going up': Analyst
count tr
7
title :  3 Mistakes to Avoid During the Prime Day Frenzy
count tr
8
title :  How to watch the NFLs Thursday Night Football Week 5 of the 2023-2024 season live online for freeand without cable
count tr
9
title :  4 Tips for Getting the Most Out of Prime Day Sales
count tr
10
title :  Amazon Prime day sales event to kick off holiday deals: Expert
count tr
11
title :  3 Things I Would Ne

In [16]:
parsed_url

ParseResult(scheme='https', netloc='finance.yahoo.com', path='/m/2f0cbbf7-a8c6-381f-9451-acfd2e967348/my-top-10-tech-stocks-for.html', params='', query='', fragment='')

In [15]:
len(news_list)

100

In [None]:
try:

    url = 'https://finance.yahoo.com/m/faab6fb7-c85c-32c9-a7bb-8ac838ea013b/3-little-known-ways-to-save.html'
    req = Request(url=url,headers={"User-Agent": "Chrome"}) 
    response = urlopen(req)
    print(response.getcode())
    if response.getcode() == 200:
        html = BeautifulSoup(response,"html.parser")
        print(html)
except Exception as e:
    print(e)
# response = urlopen(req)

In [None]:
len(news_table.findAll('tr'))

In [None]:
vader = SentimentIntensityAnalyzer()

processed_text = preprocess_text(text)
vader.polarity_scores(processed_text)

In [None]:
news_table = news_tables['AMZN']

i = news_table.findAll('tr')[1]
    
text = i.a.get_text() 
url = i.a['href']

req = Request(url=url,headers={"User-Agent": "Chrome"}) 
response = urlopen(req)
html = BeautifulSoup(response,"html.parser")
text_content = html.get_text()
print(text_content)

In [None]:
vader = SentimentIntensityAnalyzer()

vader.polarity_scores(text_content)

In [None]:
caas_body_div = html.find('div', class_='caas-body')
text = ""
for p in caas_body_div.findAll('p'):
    text += p.text + '\n'

print(text)

In [None]:
news_list

In [18]:
from datetime import datetime
def parse_today_as_date(value):
    if value == 'Today':
        return datetime.now().date()
    else:
        return value

In [30]:
vader = SentimentIntensityAnalyzer()

columns = ['ticker', 'date', 'time', 'text']

news_df = pd.DataFrame(news_list, columns=columns)

scores = news_df['text'].apply(preprocess_text).apply(vader.polarity_scores).tolist()

scores_df = pd.DataFrame(scores)

news_df = news_df.join(scores_df)

news_df['date'] = news_df['date'].apply(parse_today_as_date)


In [31]:
news_df

Unnamed: 0,ticker,date,time,text,neg,neu,pos,compound
0,AMZN,Oct-05-23,07:52PM,3 Little-Known Ways to Save Money This October...,0.058,0.628,0.314,0.8750
1,AMZN,Oct-05-23,06:38PM,Amazons SpaceX Duel Heats Up as Tardy Satellit...,0.050,0.826,0.124,0.9899
2,AMZN,Oct-05-23,06:00PM,Amazon's Prime Big Deal Days Start on Oct. 10....,0.000,0.718,0.282,0.8957
3,AMZN,Oct-05-23,05:30PM,"Market Today: Mixed Results for Major Indices,...",0.082,0.798,0.121,0.9017
4,AMZN,Oct-05-23,05:01PM,Amazon is offering free grocery delivery on or...,0.107,0.763,0.130,0.7096
...,...,...,...,...,...,...,...,...
95,AMZN,Oct-02-23,01:02PM,Amazon.com Inc (AMZN): A Deep Dive into Financ...,0.032,0.704,0.264,0.9988
96,AMZN,Oct-02-23,11:57AM,Amazon Is Bringing Ads to Prime Video. Its a B...,0.068,0.685,0.247,0.6369
97,AMZN,Oct-02-23,11:34AM,Amazon (AMZN) Zoox is Set to Establish a Facil...,0.006,0.781,0.213,0.9968
98,AMZN,Oct-02-23,11:18AM,Target's answer to Amazon Prime Days is here ...,0.000,0.811,0.189,0.3612


In [None]:
import yfinance as yf

amazon_data = yf.download("AMZN", start="2022-01-01", end="2023-10-10")
print(amazon_data)
