In [28]:
from waybackpy import WaybackMachineSaveAPI, WaybackMachineCDXServerAPI
import pandas as pd
import numpy as np
import os
import random
# read from finvizurls.txt
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request

In [2]:
# Wayback Machine calendar view for the site being scraped:
# https://web.archive.org/web/20240000000000*/finviz.com
tickers = ["MSFT"]  # ticker symbols whose quote pages are scraped in a later cell
web_url = "https://finviz.com" # site root; "/quote.ashx?t=<ticker>" is appended per ticker when scraping
# user_agent = "my new app's user agent"
# CDX query for archived snapshots of web_url, bounded by start_timestamp=2021
# and end_timestamp=2023 (original note: the start timestamp is 2 years ago).
cdx_api = WaybackMachineCDXServerAPI(web_url, start_timestamp=2021, end_timestamp=2023)

In [None]:
# Print the archive URL of every snapshot returned by the CDX query.
# These URLs were copied by hand into finvizurls.txt, which the scraping
# cell reads.
for snapshot in cdx_api.snapshots():
    archive_url = snapshot.archive_url
    print(archive_url)

In [22]:

# TODO: beyond scoring sentiment, check whether each headline actually mentions
# Microsoft -- e.g. filter on company names, or down-weight headlines that do
# not mention MSFT.
random.seed(0)  # fixed seed so the same snapshots are sampled on every run
N_SNAPSHOTS = 40  # upper bound on the number of archived snapshots to sample
news_tables = {}

# Read the archive URLs up front so the file is closed before any network I/O.
# Blank lines are dropped because they would produce invalid request URLs.
with open("finvizurls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

# random.sample() over a range yields *indices*; map them back to the URLs.
# The sample size is capped so a short file does not raise ValueError.
sample_size = min(N_SNAPSHOTS, len(urls))
urls_select = [urls[idx] for idx in random.sample(range(len(urls)), sample_size)]

for url in urls_select:
    for ticker in tickers:
        url_req = f"{url}/quote.ashx?t={ticker}"
        # Send an explicit User-Agent header (an earlier version of this code
        # did not use the right one).
        req = Request(url=url_req, headers={"User-Agent": "FireFox"})
        # The context manager closes the HTTP response once it has been parsed.
        with urlopen(req) as response:
            html = BeautifulSoup(response, "html.parser")
        # html.find() returns None when the page has no element with this id.
        news_table = html.find(id='news-table')
        news_tables[ticker] = news_table
    # Only the first sampled snapshot is scraped: news_tables is keyed by
    # ticker alone, so further snapshots would overwrite this one.
    break

In [None]:
# Inspect the raw news table scraped for MSFT.
print(news_tables["MSFT"])

In [24]:
from datetime import date


def _parse_news_table(ticker, news_table):
    """Extract one ``[ticker, date, time, source, headline]`` row per headline.

    Args:
        ticker: Label stored in the first column of every returned row.
        news_table: Parsed ``news-table`` element, or ``None`` when the page
            had no such element.

    Returns:
        A list of five-item lists. ``date`` is ``None`` for rows that appear
        before any row carrying a date; ``source`` is ``None`` when the row
        has no ``div > span`` element.
    """
    rows = []
    if news_table is None:
        return rows

    # A row whose first cell holds only a time inherits the date of the
    # closest dated row above it. The date is reset for every table so it
    # cannot leak in from another table or an earlier run of the cell.
    current_date = None
    for row in news_table.find_all('tr'):
        link = row.a
        if link is None:
            # Rows without a link carry no headline.
            continue
        headline = link.get_text()

        stamp = row.td.text.split() if row.td is not None else []
        if not stamp:
            # No timestamp cell to read a date or time from.
            continue
        if len(stamp) == 1:
            time_of_day = stamp[0]
        else:
            current_date, time_of_day = stamp[0], stamp[1]
            if current_date == "Today":
                # NOTE(review): this substitutes the day the notebook runs;
                # for archived pages the snapshot date is presumably what is
                # wanted -- confirm.
                current_date = date.today().strftime("%Y-%m-%d")

        source_tag = row.div.span if row.div is not None else None
        source = source_tag.get_text() if source_tag is not None else None

        rows.append([ticker, current_date, time_of_day, source, headline])
    return rows


news_list = []

for file_name, news_table in news_tables.items():
    # Keys may carry a "_suffix"; only the part before the first "_" is kept.
    tick = file_name.split('_')[0]
    news_list.extend(_parse_news_table(tick, news_table))

columns = ['ticker', 'date', 'time', 'source', 'headline']
news_df = pd.DataFrame(news_list, columns=columns)
# format='mixed' parses each value on its own: the column can hold the ISO
# string substituted for "Today" next to whatever format the page itself uses.
news_df['date'] = pd.to_datetime(news_df.date, format='mixed').dt.date

In [26]:
# Preview the last rows of the parsed headlines frame.
print("news df", news_df.tail())

news df    ticker        date     time                      source  \
95   MSFT  2020-12-22  02:08PM                Investopedia   
96   MSFT  2020-12-22  02:05PM                Investopedia   
97   MSFT  2020-12-22  02:01PM                Investopedia   
98   MSFT  2020-12-22  12:13PM   Investor's Business Daily   
99   MSFT  2020-12-22  12:00PM               TheStreet.com   

                                             headline  
95                            Companies Owned by MSFT  
96                         Top Microsoft Shareholders  
97  How Microsoft Makes Money: Personal Computing,...  
98  Dow Jones Drops Despite Apple Stock's Gain; Tr...  
99                    Does Microsoft Pass Our 'Test'?  


In [32]:
# Show the first parsed headline row.
print(news_df.iloc[0])

ticker                                                   MSFT
date                                               2020-12-31
time                                                  05:03PM
source                                            Barrons.com
headline    Section 230 Keeps Coming Up in Congress. Heres...
Name: 0, dtype: object


In [29]:
# compare vader polarity vs our version
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance; get_scores() below calls its polarity_scores().
vader = SentimentIntensityAnalyzer()

def get_scores(df):
    """Return the mean VADER compound score of ``df['headline']``.

    Args:
        df: DataFrame with a ``headline`` column; every value is passed to
            ``vader.polarity_scores``.

    Returns:
        The mean of the ``'compound'`` scores as a float rounded to four
        decimals, or NaN when ``df`` has no rows.
    """
    headlines = df['headline']
    if len(headlines) == 0:
        # np.mean([]) also yields NaN but emits a RuntimeWarning; return the
        # same value explicitly instead.
        return float('nan')
    compound_scores = [vader.polarity_scores(text)['compound'] for text in headlines]
    return round(float(np.mean(compound_scores)), 4)

# Mean compound VADER score over all parsed headlines (rounded by get_scores).
print("vader", get_scores(news_df))

vader 0.0844
