In [8]:
from openai import OpenAI
import pandas as pd
import os

## Sentiment algorithm

In [5]:
import pandas as pd
import numpy as np
import random
# read from finvizurls.txt
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from datetime import date, datetime
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared module-level VADER analyzer; HistoricalSentiment uses its
# polarity_scores method as the default per-headline scoring function.
vader = SentimentIntensityAnalyzer()

class HistoricalSentiment:
    """Scrape the news table for one ticker and average a per-headline score.

    The scoring function is pluggable: it receives a headline string and must
    return a mapping containing a ``'compound'`` key (as VADER's
    ``polarity_scores`` does).
    """

    def __init__(self, ticker, fn=vader.polarity_scores):
        """Store the ticker symbol and the headline scoring function.

        Args:
            ticker: Symbol substituted into the quote-page query string.
            fn: Callable taking a headline and returning a mapping with a
                ``'compound'`` entry. Defaults to ``vader.polarity_scores``.
        """
        self.ticker = ticker
        self.fn = fn

    def find_articles(self, url):
        """Fetch ``{url}/quote.ashx?t=<ticker>`` and return its news table.

        Args:
            url: Base URL that the quote path is appended to.

        Returns:
            The element with ``id='news-table'``, or ``None`` when the page
            contains no such element.
        """
        url_req = f"{url}/quote.ashx?t={self.ticker}"
        # An explicit User-Agent header is sent with the request.
        # NOTE(review): presumably the site rejects urllib's default agent - confirm.
        req = Request(url=url_req, headers={"User-Agent": "FireFox"})
        # Close the HTTP response once the document has been parsed.
        with urlopen(req) as response:
            html = BeautifulSoup(response, "html.parser")

        return html.find(id='news-table')

    def generate_news_df(self, news_table):
        """Convert the scraped news table into a DataFrame of headlines.

        Rows without a link are skipped. A row whose first cell holds only a
        time reuses the date of the most recent row that carried one; if no
        date has been seen yet the date is left missing (NaT) instead of
        raising.

        Args:
            news_table: Element returned by ``find_articles``; ``None`` is
                treated as an empty table.

        Returns:
            DataFrame with columns ``ticker``, ``date``, ``time``, ``source``
            and ``headline``.
        """
        columns = ['ticker', 'date', 'time', 'source', 'headline']
        news_list = []
        # Date carried forward to rows that only show a time.
        final_date = None

        # TODO: filter based on time (i.e. use previous day to get news for next day)

        rows = news_table.find_all('tr') if news_table is not None else []
        for row in rows:
            anchor = row.a
            if anchor is None:
                # Not a headline row (no link) - skip it.
                continue
            text = anchor.get_text()

            date_scrape = row.td.text.split() if row.td is not None else []
            source_tag = row.div.span if row.div is not None else None
            source = source_tag.get_text() if source_tag is not None else ""

            if len(date_scrape) >= 2:
                final_date = date_scrape[0]
                time_str = date_scrape[1]
                if final_date == "Today":
                    final_date = date.today().strftime("%Y-%m-%d")
            elif len(date_scrape) == 1:
                time_str = date_scrape[0]
            else:
                # Empty or missing timestamp cell.
                time_str = None

            news_list.append([self.ticker, final_date, time_str, source, text])

        news_df = pd.DataFrame(news_list, columns=columns)
        news_df['date'] = pd.to_datetime(news_df.date, format='mixed').dt.date

        # All scraped headlines are kept; no per-day sampling is applied.
        print("length of news df", len(news_df))

        return news_df

    def calculate_sentiment(self, url):
        """Scrape headlines from ``url`` and return them with their mean score.

        Stores the raw table on ``self.news_scraped`` and the parsed frame on
        ``self.news_df`` as a side effect.

        Args:
            url: Base URL passed through to ``find_articles``.

        Returns:
            Tuple ``(headlines, sentiment)`` where ``headlines`` is the
            ``headline`` column and ``sentiment`` is the mean ``'compound'``
            score rounded to 4 decimals, or NaN when no headline was found.
        """
        self.news_scraped = self.find_articles(url=url)
        self.news_df = self.generate_news_df(self.news_scraped)

        headlines = self.news_df['headline']
        scores = [self.fn(headline)['compound'] for headline in headlines]
        if not scores:
            # Avoid numpy's "mean of empty slice" warning; the result is NaN either way.
            return headlines, float("nan")

        final_sentiment = round(float(np.mean(scores)), 4)
        return headlines, final_sentiment

In [7]:
# Load the list of base URLs to scrape, one per line.
with open("finvizurls_test.txt", "r") as f:
    urls_select = f.readlines()

# Random sub-sampling of the URL list is currently disabled; every line is used.
news_tables = {}

tickers = ["MSFT"]

print("urls selected", len(urls_select))

# Maps ticker -> aggregated sentiment. Each URL overwrites the previous value,
# so after the loop only the last successfully scraped URL's score remains.
sentiments = {}
for url in urls_select:
    # Strip the trailing newline left by readlines().
    url = url.strip()
    if not url:
        # A blank line would become "/quote.ashx?t=..." and make Request()
        # raise ValueError (unknown url type), so skip it.
        continue
    for ticker in tickers:
        obj_vader = HistoricalSentiment(ticker, vader.polarity_scores)
        headline, sentiment_vader = obj_vader.calculate_sentiment(url=url)
        print("headlines:")
        print(headline.head())
        print("aggregated sentiment:", sentiment_vader)
        sentiments[ticker] = sentiment_vader
        # Only the first ticker is processed for each URL.
        break

print("sentiments", sentiments)


# TODO: investigate after-hours stock movement - how should we handle it?

urls selected 409
length of news df 100
headlines:
0    Google Beats Microsoft To Make Cloud Cybersecu...
1    Explainer-Will Big Tech cloud companies cut of...
2    Google Has A New Cybersecurity Game Plan To Ke...
3    EXPLAINER-Will Big Tech cloud companies cut of...
4    Russia-Ukraine war: Companies that have taken ...
Name: headline, dtype: object
aggregated sentiment: 0.0007
length of news df 100
headlines:
0    Google Beats Microsoft To Make Cloud Cybersecu...
1    Explainer-Will Big Tech cloud companies cut of...
2    Google Has A New Cybersecurity Game Plan To Ke...
3    EXPLAINER-Will Big Tech cloud companies cut of...
4    Russia-Ukraine war: Companies that have taken ...
Name: headline, dtype: object
aggregated sentiment: 0.0007
length of news df 100
headlines:
0    Google Beats Microsoft To Make Cloud Cybersecu...
1    Explainer-Will Big Tech cloud companies cut of...
2    Google Has A New Cybersecurity Game Plan To Ke...
3    EXPLAINER-Will Big Tech cloud companies cut o

KeyboardInterrupt: 

## OpenAI

In [None]:
# OpenAI-SDK client pointed at DeepInfra's OpenAI-compatible endpoint.
# The key is read from the OPENAI_API_KEY environment variable.
# NOTE(review): the variable is named for OpenAI but base_url targets
# DeepInfra - confirm it actually holds a DeepInfra token.
openai = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)

# Prompt asking the model to map explicitly mentioned companies to ticker
# symbols and a sentiment label, returned as JSON only. The headline to
# classify is hard-coded in the "Header: [...]" line near the end.
prompt = """
Task: Identify Company Tickers and Sentiments

Instructions:
1. Given the header provided, identify the names of the companies explicitly mentioned.
2. Determine the corresponding ticker symbols for each identified company.
3. Assess the sentiment associated with each company (buying or not buying the stock) mentioned in the header (positive, neutral, negative).
4. Return the results in JSON format with each company's ticker symbol mapped to its sentiment. Do not include any details beyond this. 

Do not assume the names of any companies in the header. Only include companies that are explicitly mentioned and exclude all else.

Example Input:
Header: "Analysts predict positive outlook for Apple Inc. (AAPL) and minimal growth for Tesla (TSLA)."

Example Output:
{"AAPL": "positive", "TSLA": "negative"}

Header: [OpenAI could launch new AI search engine on Monday: Reuters]

Only print the json. 

""".strip()

# Single-turn chat request; max_tokens caps the length of the reply.
chat_completion = openai.chat.completions.create(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=200
)
# Alternative (disabled): same request against the 8B model, without a token cap.
# chat_completion = openai.chat.completions.create(
#     model="meta-llama/Meta-Llama-3-8B-Instruct",
#     messages=[{"role": "user", "content": prompt}],
# )

# Show the text of the first returned choice.
print(chat_completion.choices[0].message.content)