In [1]:
from transformers import pipeline
import nltk
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
summarizer = pipeline("summarization", truncation=True)
analyzer = SentimentIntensityAnalyzer()
token_analyzer = SentimentAnalyzer()

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
def analyze_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment


def article_summarizer(article):
    summary = summarizer(article, max_length=300, min_length=100, do_sample=False)
    
    return summary[0]['summary_text']

def get_companies_names():
    db = next(get_db())
    try:
        stocks = db.query(Stock.title).all()
        return stocks
    finally:
        db.close()

___Basic version of the algorithm___

Analyzes pre-processed text. Processing includes: removing advertisements, removing unnecessary abbreviations and abbreviations, repeating the same thing.

In [6]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            sum_article = article_summarizer(article_text)
            analyze = analyze_sentiment(sum_article)
            if analyze['pos'] > analyze['neg']:
                pos_mark += 1
            else:
                neg_mark += 1
        articles_marks.append({'stock': company_name, 'positives': pos_mark, 'negatives': neg_mark})
        break
base_articles_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
print(base_articles_df)

Your max_length is set to 300, but your input_length is only 83. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Your max_length is set to 300, but your input_length is only 230. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=115)


                stock  positives  negatives
0  NVIDIA Corporation          3          0


__Adding news sentiment scores for each stock to the database.__

In [4]:
from sqlalchemy import text
from database.models import Stock, StockSentiment
from database.db import get_db

def add_to_db(df):
    db = next(get_db())
    try:
        db.query(StockSentiment).delete()
        db.commit()

        db.execute(text("ALTER SEQUENCE stock_sentiments_id_seq RESTART WITH 1"))
        db.commit()
        
        print("Updating stock sentiments...")

        stock_sentiments = []
        for index, row in df.iterrows():
            stock = db.query(Stock).filter_by(title=row['stock']).first()
            if stock:
                stock_sentiment = StockSentiment(
                    stock_id=stock.id,
                    positives=row['positives'],
                    negatives=row['negatives']
                )
                stock_sentiments.append(stock_sentiment)
            else:
                print(f"Stock '{row['stock']}' not found in the Stock table")

        db.bulk_save_objects(stock_sentiments)
        db.commit()

        print("Stock sentiments updated successfully.")
    except Exception as e:
        db.rollback()
        print(f"Failed to update stock sentiments: {e}")
    finally:
        db.close()
        
    
def clear_db():
    db = next(get_db())
    try:
        db.query(StockSentiment).delete()
        db.commit()
        print("Database cleared successfully.")
    except Exception as e:
        db.rollback()
        print(f"Failed to update stock sentiments: {e}")
    finally:
        db.close()

In [7]:
add_to_db(base_articles_df)

Updating stock sentiments...
Stock sentiments updated successfully.


The second version of the algorithm using tokenization.

In [11]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))
stop_words = set(stopwords.words("english"))
articles_list = []

def delete_punctuation(article):
    pattern = r'[^\w\s]'
    article = re.sub(pattern,'',article)
    return article


def sentence_tokenization(sentence):
    article_word_list = nltk.tokenize.word_tokenize(sentence)
    return article_word_list


def article_tokenization(article):
    sentences = nltk.tokenize.sent_tokenize(article)
    return sentences


def delete_stop_words(article_list):
    global stop_words
    processed_article = [word for word in article_list if word.lower() not in stop_words]
    return processed_article


def analyze_tokenized_sentiment(tokenized_article):
    sentiment_scores = [token_analyzer.all_words(word) for word in tokenized_article if word.isalpha()]
    return sentiment_scores

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        articles_list = []
        for filename in os.listdir(folder_path):
            words_list = []
            sentences_list = []
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            article = re.sub(r'\.(?!\s)', '. ', article_text)
            sentences = article_tokenization(article_text)
            for sentence in sentences:
                sentence = delete_punctuation(sentence)
                tokenized_sentence = sentence_tokenization(sentence)
                without_base_sw = delete_stop_words(tokenized_sentence)
                words_list.append(without_base_sw)
            for word_list in words_list:
                sentence = ' '.join(word_list)
                
                sentence = sentence.capitalize()
                
                sentence = re.sub(r'([.!?])(\w)', r'\1 \2', sentence)
                sentences_list.append(sentence)
            articles_list.append(sentences_list)
        pos_counter = 0 
        neg_counter = 0
        for article in articles_list:
            compound_counter: float = 0
            for sentence in article:
                if analyze_sentiment(sentence)["neg"] > analyze_sentiment(sentence)["pos"]:
                    neg_counter += 1
                elif analyze_sentiment(sentence)["pos"] > analyze_sentiment(sentence)["neg"]:
                    pos_counter += 1
        articles_marks.append({'stock': company_name, 'positives': pos_counter, 'negatives': neg_counter})
    

tokenized_sentences_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
tokenized_sentences_df

This is The Takeaway from today's Morning Brief, which you can sign up to receive in your inbox every morning along with:The chart of the dayWhat we're watchingWhat we're readingEconomic data releases and earningsAnd just like that, everyone is a recession expert. Two weeks ago, most self-proclaimed finance experts hadn't uttered the word recession since it was fashionable in late 2022/early 2023. From late July to early August, the prevailing sentiment of those seemingly in the know was 1) Nvidia (NVDA) shares were due for another 50% move after earnings on Aug. 28; 2) a 10% year-end rally for the S&P 500; and 3) a 100% move in Nvidia's stock price in 2025. Yet here we are, with the pros scaring the heck out of everyone the past week on the potential for a recession after a "bad" jobs report last Friday. Two top Wall Street banks raised their recession probabilities this week, for example. These pros have voiced their concerns on TV, social media, and in research reports, but they als

In [10]:
add_to_db(tokenized_sentences_df)

Updating stock sentiments...
Stock sentiments updated successfully.


In [None]:
db = next(get_db())
db.query(StockSentiment).delete()
db.commit()

In [35]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))
stop_words = set(stopwords.words("english"))
articles_list = []

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        articles_list = []
        for filename in os.listdir(folder_path):
            words_list = []
            sentences_list = []
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            article = re.sub(r'\.(?!\s)', '. ', article_text)
            sentences = article_tokenization(article_text)
            for sentence in sentences:
                sentence = delete_punctuation(sentence)
                tokenized_sentence = sentence_tokenization(sentence)
                without_base_sw = delete_stop_words(tokenized_sentence)
                words_list.append(without_base_sw)
            for word_list in words_list:
                sentence = ' '.join(word_list)
                
                sentence = sentence.capitalize()
                sentence = sentence + "."
                
                sentences_list.append(sentence)
            articles_list.append(sentences_list)
        pos_counter = 0 
        neg_counter = 0
        full_articles_list = []
        for article in articles_list:
            full_article = ""
            for sentence in article:
                full_article = full_article + sentence + " "
            full_articles_list.append(full_article)
            
        for article_text in full_articles_list:
            if analyze_sentiment(article_text)["compound"] <= 0:
                neg_counter += 1
            elif analyze_sentiment(article_text)["compound"] > 0:
                pos_counter += 1
            print(analyze_sentiment(article_text))
        articles_marks.append({'stock': company_name, 'positives': pos_counter, 'negatives': neg_counter})
        
tokenized_articles_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
tokenized_articles_df

{'neg': 0.059, 'neu': 0.859, 'pos': 0.082, 'compound': 0.9186}
{'neg': 0.069, 'neu': 0.812, 'pos': 0.119, 'compound': 0.9887}
{'neg': 0.051, 'neu': 0.861, 'pos': 0.087, 'compound': 0.8467}
{'neg': 0.069, 'neu': 0.817, 'pos': 0.115, 'compound': 0.967}
{'neg': 0.028, 'neu': 0.777, 'pos': 0.195, 'compound': 0.9971}
{'neg': 0.059, 'neu': 0.832, 'pos': 0.109, 'compound': 0.9363}
{'neg': 0.056, 'neu': 0.764, 'pos': 0.18, 'compound': 0.9952}
{'neg': 0.028, 'neu': 0.773, 'pos': 0.198, 'compound': 0.9956}
{'neg': 0.061, 'neu': 0.777, 'pos': 0.162, 'compound': 0.9851}
{'neg': 0.033, 'neu': 0.811, 'pos': 0.156, 'compound': 0.9783}
{'neg': 0.01, 'neu': 0.812, 'pos': 0.178, 'compound': 0.9916}
{'neg': 0.028, 'neu': 0.773, 'pos': 0.198, 'compound': 0.9956}
{'neg': 0.037, 'neu': 0.827, 'pos': 0.136, 'compound': 0.9843}
{'neg': 0.053, 'neu': 0.69, 'pos': 0.257, 'compound': 0.9963}
{'neg': 0.043, 'neu': 0.787, 'pos': 0.17, 'compound': 0.9966}
{'neg': 0.054, 'neu': 0.759, 'pos': 0.187, 'compound': 0.982

Unnamed: 0,stock,positives,negatives
0,Tesla,6,0
1,NVIDIA,5,0
2,Alphabet A,5,0
3,CrowdStrike Holdings,5,0
4,Apple,4,0
5,Amazon.com,7,0
6,Verizon,7,0
7,AMC Entertainment,5,0
8,Alphabet C,8,0
9,Coinbase Global,4,0


In [5]:
import vertexai
from vertexai.generative_models import GenerativeModel

project_id = "phonic-obelisk-431915-c8"

vertexai.init(project=project_id, location="europe-west2")

model = GenerativeModel("gemini-1.5-flash-001")

In [6]:
from time import sleep

articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))
stop_words = set(stopwords.words("english"))
articles_list = []


def delete_superfluous(article):
    pattern_1 = r'\(\w+\)'
    article = re.sub(pattern_1,'',article)
    pattern_2 = r'\(NASDAQ: \w+\)'
    article = re.sub(pattern_2,'',article)
    return article

def delete_punctuation(article):
    pattern = r'[^\w\s]'
    article = re.sub(pattern,'',article)
    return article


def sentence_tokenization(sentence):
    article_word_list = nltk.tokenize.word_tokenize(sentence)
    return article_word_list


def article_tokenization(article):
    sentences = nltk.tokenize.sent_tokenize(article)
    return sentences


def delete_stop_words(article_list):
    global stop_words
    processed_article = [word for word in article_list if word.lower() not in stop_words]
    return processed_article


def request_processing(request):
    response = model.generate_content(request)
    return response.text


# def analyze_tokenized_sentiment(tokenized_article):
#     sentiment_scores = [token_analyzer.all_words(word) for word in tokenized_article if word.isalpha()]
#     return sentiment_scores
c = 0
while c<25:
    for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
        for dir_name, company_name in zip(dirs, companies_names):
            
            request = f"I have several financial articles about {dir_name}. I want you to analyze each one in turn and provide the result in the form of: 3 sentences that best describe what the article is about, and also I want you to provide a forecast based on this news, with what chance the stock price will go up and with what chance it will go down in the format: '(decrease 30% | increase 70%)' - the total should be 100%. And also how useful is this article for predicting the rise/fall of a stock in the format '(Informativeness: 50%)'. Here is one of the articles: \n\n"
            
            folder_path = os.path.join(root, dir_name)
            pos_mark = 0
            neg_mark = 0
            articles_list = []
            for filename in os.listdir(folder_path):
                words_list = []
                sentences_list = []
                file_path = os.path.join(folder_path, filename)
                article_file = open(file_path, 'r', encoding="utf-8")
                article_text = article_file.read()
                article = re.sub(r'\.(?!\s)', '. ', article_text)
                sentences = article_tokenization(article_text)
                for sentence in sentences:
                    sentence = delete_superfluous(sentence)
                    sentence = delete_punctuation(sentence)
                    tokenized_sentence = sentence_tokenization(sentence)
                    without_base_sw = delete_stop_words(tokenized_sentence)
                    words_list.append(without_base_sw)
                for word_list in words_list:
                    sentence = ' '.join(word_list)
                    
                    sentence = sentence.capitalize()
                    
                    sentence = re.sub(r'([.!?])(\w)', r'\1 \2', sentence)
                    sentences_list.append(sentence)
                articles_list.append(sentences_list)
            pos_counter = 0 
            neg_counter = 0
            responses = []
            for article in articles_list:
                full_article = ". ".join(article)
                full_request = request + full_article
                response = request_processing(full_request)
                responses.append(response)
                print(response)
                sleep(30)
                c = c + 1
        
    

# tokenized_sentences_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
# tokenized_sentences_df

This article discusses the recent market volatility and the possibility of a recession. It analyzes various economic indicators, including the ISM Services Report, initial jobless claims, and corporate earnings, and concludes that the current economic situation doesn't justify the market's panic.  It argues that the recent market downturn is more likely a result of a gradual cooling of the economy than a full-blown recession, especially considering the strength of the labor market and the possibility of future Fed rate cuts.

**(Decrease 20% | Increase 80%)**

**(Informativeness: 60%)**

The article provides a balanced perspective on the potential for a recession, analyzing data and considering various factors. While it argues against a recession, it acknowledges the risks and the potential for a gradual economic slowdown. It also highlights the importance of monitoring economic indicators and the potential impact of Fed actions. However, it lacks specific details about AMD's performan

In [8]:
forecast_pattern_1 = re.compile(r'\(decrease (\d+%) \| increase (\d+%)\)')
forecast_pattern_2 = re.compile(r'\(increase (\d+%) \| decrease (\d+%)\)')
informativeness_pattern = re.compile(r'\(informativeness: (\d+%)\)')

# Extract forecasts and informativeness
data = []

for response in responses:
    lower_response = response.lower()
    forecasts = forecast_pattern_1.findall(lower_response)
    if forecasts == []:
        swap = forecast_pattern_2.findall(lower_response)
        forecasts = [(swap[0][1], swap[0][0])]
    informativeness = informativeness_pattern.findall(lower_response)

    data.append({
        'Decrease Probability': int(forecasts[0][0].replace('%', '')),
        'Increase Probability': int(forecasts[0][1].replace('%', '')),
        'Informativeness': int(informativeness[0].replace('%', ''))
    })

df = pd.DataFrame(data)

df

Unnamed: 0,Decrease Probability,Increase Probability,Informativeness
0,20,80,60
1,50,50,70
2,10,90,70
3,25,75,70
4,20,80,70
5,20,80,70
6,10,90,70
7,20,80,70


In [28]:
patterns_to_remove = [
    r"\*\*.*?\*\*|##.*?##', '"
    r"\n\s*\n', '\n\n",
    r"##.*",
    r'\(.*?\)',
    r'\b\d+\.',
    r'\*',
]

# Process text
def remove_lines(text, patterns):
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    # paragraphs = text.split('\n')
    # clean_paragraphs = []
    # for i in paragraphs:
    #     if len(i)>2:
    #         clean_paragraphs.append(i)
    # return text
    

for response in responses:
    cleaned_text = remove_lines(response, patterns_to_remove)
    print(cleaned_text)
    print("\n" + "-"*80 + "\n")



This article discusses the recent market volatility and the possibility of a recession. It analyzes various economic indicators, including the ISM Services Report, initial jobless claims, and corporate earnings, and concludes that the current economic situation doesn't justify the market's panic.  It argues that the recent market downturn is more likely a result of a gradual cooling of the economy than a full-blown recession, especially considering the strength of the labor market and the possibility of future Fed rate cuts.
The article provides a balanced perspective on the potential for a recession, analyzing data and considering various factors. While it argues against a recession, it acknowledges the risks and the potential for a gradual economic slowdown. It also highlights the importance of monitoring economic indicators and the potential impact of Fed actions. However, it lacks specific details about AMD's performance and its potential impact on the stock price.

---------------