In [1]:
from langchain_community.llms import Ollama

In [2]:
llm = Ollama(model="llama3", cache=False, top_p = 0.9, top_k = 40, temperature=0.5)

In [5]:
llm.invoke("What did I just ask you?")

"You didn't ask me anything yet. This conversation just started, and your first message was an empty line. Would you like to ask me something now?"

In [None]:
import pandas as pd
import re
df = pd.read_csv("../sentiment_score.csv")
constituents = pd.read_csv("../s&p_constituents.csv")
names = [i.lower() for i in constituents['Security'].tolist()]
tickers = constituents['Symbol'].tolist()

current_ticker = "MSFT"
for headline in df['headline']:
    st = "{\"TSLA\": \"positive\", \"AAPL\": \"neutral\", \"GOOG\": \"negative\"} or {} if no companies mentioned"
    prompt = f"""
    The following financial news headline is about {current_ticker} but may mention other companies.
    Please provide the sentiment (positive, neutral, or negative) solely in relation to {current_ticker} 
    given the following headline in the brackets below. The sentiment should reflect the favorability of {current_ticker} for investors. 
    If you are not sure, answer neutral.
    [{headline}]
    """.strip()
    # Omit any explanation of sentiment analysis.
    # You are a financial analyst with knowledge of all stock tickers in the s&p 500. 
    # for chunks in llm.stream(prompt):
    #     print(chunks, end="")
    val = llm.invoke(prompt)
    # use regex to parse for dictionary, denoted by curly braces
    print("**HEADLINE**")
    print(headline)
    print("**RESPONSE**")
    print(val)
    

## Running the thing

In [4]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\25ben\AppData\Roaming\nltk_data...


True

In [2]:
import pandas as pd
import numpy as np
import random
# read from finvizurls.txt
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import date, datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()
import urllib.parse
import posixpath
class HistoricalSentiment:

    def __init__(self, ticker=None, fn=vader.polarity_scores):
        self.ticker = ticker
        self.fn = fn

    def find_articles(self, url):
        # url_req = f"{url}/quote.ashx" # ?t={self.ticker}
        # url_req = urllib.parse.urljoin(url, f"quote.ashx?t={self.ticker}")
        url_req = posixpath.join(url, f"quote.ashx?t={self.ticker}")
        print(url_req)

        req = Request(url=url_req, headers={"User-Agent": "FireFox"}) # I realize that aditya's version of the code doesn't use the right user agent
        response = urlopen(req)
        html = BeautifulSoup(response, "html.parser")
        news_table = html.find(id='news-table') # id
        news_tablev2 = html.find(class_='fullview-news-outer')
        # return whichever table is not None
        print("using news table" if news_table is not None else "using news table v2")
        
        return news_table if news_table is not None else news_tablev2
    
    def generate_news_df(self, news_table):
        news_list = []
        # oldest.datetime_timestamp
        # datetime.datetime(1998, 11, 11, 18, 45, 51)

        # TODO: filter based on time (i.e. use previous day to get news for next day)

        for i in news_table.findAll('tr'):
            try:
                text = i.a.get_text()
            except:
                continue

            date_scrape = i.td.text.split()
            source = i.div.span.get_text()

            if len(date_scrape) == 1:
                time = date_scrape[0]

            else:
                final_date = date_scrape[0]
                time = date_scrape[1]

                if final_date == "Today":
                    final_date = date.today().strftime("%Y-%m-%d")

            tick = self.ticker

            news_list.append([tick, final_date, time, source, text])

        columns = ['ticker', 'date', 'time', 'source', 'headline']
        news_df = pd.DataFrame(news_list, columns=columns)
        news_df['date'] = pd.to_datetime(news_df.date, format='mixed').dt.date

        # randomly select 40 headlines from 40 different days. This will have to be stratified by date
        # don't necessarily select 40 randomly, just take all
        # for i in range(40):
        #     news_df = news_df.sample(frac=1).groupby('date').head(1)
        print("length of news df", len(news_df))

        return news_df
    
    def calculate_sentiment(self, url):
        self.news_scraped = self.find_articles(url=url)
        self.news_df = self.generate_news_df(self.news_scraped)
        # requires that find_articles has been called and generated a news_df

        scores = self.news_df['headline'].apply(self.fn).tolist()
        scores = [x['compound'] for x in scores]
        sentiment = float(np.mean(scores))
        final_sentiment = round(sentiment, 4)
        # print(self.news_df.head())
        return self.news_df['headline'], final_sentiment

In [3]:
import time
with open("finvizurls_dates.txt", "r") as f:
    urls_select = f.readlines()

# urls_select = random.sample(range(len(urls)), 40)
# urls_select = [urls[i] for i in urls_select]
news_tables = {}

tickers = ["MSFT"]

print("urls selected", len(urls_select))

sentiments = {}
for idx, url in enumerate(urls_select):
    # save url to access metadata
    # beatiful soup to extract the text
    url = url.strip()
    print(f"=== URL {idx} ===")
    for ticker in tickers:
        obj_vader = HistoricalSentiment(ticker, vader.polarity_scores)
        headline, sentiment_vader = obj_vader.calculate_sentiment(url=url)
        # print("headlines:")
        # print(headline.head())
        print(f"aggregated sentiment for {ticker}:", sentiment_vader)
        sentiments[ticker] = sentiment_vader
        break

    # sleep for a little for each url
    time.sleep(3)

print("sentiments", sentiments)


# TODO: investigate after hours stock moving - how do we deal with this?

urls selected 676
=== URL 0 ===
https://web.archive.org/web/20220228005514/http://finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.0007
=== URL 1 ===
https://web.archive.org/web/20220301001745/https://finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.0007
=== URL 2 ===
https://web.archive.org/web/20220302011012/http://finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.0007
=== URL 3 ===
https://web.archive.org/web/20220303051854/http://www.finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.0007
=== URL 4 ===
https://web.archive.org/web/20220304194101/https://finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.0007
=== URL 5 ===
https://web.archive.org/web/20230803010522/https://finviz.com/quote.ashx?t=MSFT
length of news df 100
aggregated sentiment for MSFT: 0.1444
=== URL 6 ===
https://web.archive.org/web/20230804033501/

URLError: <urlopen error [WinError 10061] No connection could be made because the target machine actively refused it>

In [None]:
"""Your job is to determine the sentiment (positive, negative, neutral) corresponding
    to companies and their stock tickers, if any, explicitly mentioned in the given financial news headline. 
    Only determine sentiment corresponding to companies explicitly mentioned in the 
    headline, do not try to predict companies that might be in the article. 
    Print only what belongs in the braces. 
    Examples of outputs include {st}. Do not explain the output.
    If you are not sure, please don't share false information. 

    Headline: [Meet the Supercharged Growth Stock That's a Shoo-in to Join Microsoft in the $3 Trillion Club]
    Predicted sentiment: {"{}"}
    
    di = re.search(r'\{.*\}', val).group()
    # if the dictionary is not in the correct format, ensure the keys and values have quotes
    # convert any company names to corresponding ticker
    for name_idx, name in enumerate(names):
        for key in di:
            # if the key is a company name, replace it with the corresponding ticker
            # name will be multiple words long. if any part of the name is in the key, replace it
            for word in name.split():
                if word.lower() in key:
                    di = di.replace(key, tickers[name_idx])
    # remove any quotes around the keys
    # di = re.sub(r'"', '', di)
    di = re.sub(r'(\w+):', r'"\1":', di)

    print("di original", di)
    di_real = eval(di)
    print(headline, di_real)
    """