In [28]:
from transformers import pipeline
import nltk
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment import SentimentAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
summarizer = pipeline("summarization", truncation=True)
analyzer = SentimentIntensityAnalyzer()
token_analyzer = SentimentAnalyzer()

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
def analyze_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment


def article_summarizer(article):
    summary = summarizer(article, max_length=300, min_length=100, do_sample=False)
    
    return summary[0]['summary_text']

def get_companies_names():
    db = next(get_db())
    try:
        stocks = db.query(Stock.title).all()
        return stocks
    finally:
        db.close()

___Basic version of the algorithm___

Analyzes pre-processed text. Processing includes: removing advertisements, removing unnecessary abbreviations and abbreviations, repeating the same thing.

In [23]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            sum_article = article_summarizer(article_text)
            analyze = analyze_sentiment(sum_article)
            if analyze['pos'] > analyze['neg']:
                pos_mark += 1
            else:
                neg_mark += 1
        articles_marks.append({'stock': company_name, 'positives': pos_mark, 'negatives': neg_mark})
        break
base_articles_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
print(base_articles_df)

       stock  positives  negatives
0  Microsoft          2          4


__Adding news sentiment scores for each stock to the database.__

In [5]:
from sqlalchemy import text
from database.models import Stock, StockSentiment
from database.db import get_db

def add_to_db(df):
    db = next(get_db())
    try:
        db.query(StockSentiment).delete()
        db.commit()

        db.execute(text("ALTER SEQUENCE stock_sentiments_id_seq RESTART WITH 1"))
        db.commit()
        
        print("Updating stock sentiments...")

        stock_sentiments = []
        for index, row in df.iterrows():
            stock = db.query(Stock).filter_by(title=row['stock']).first()
            if stock:
                stock_sentiment = StockSentiment(
                    stock_id=stock.id,
                    positives=row['positives'],
                    negatives=row['negatives']
                )
                stock_sentiments.append(stock_sentiment)
            else:
                print(f"Stock '{row['stock']}' not found in the Stock table")

        db.bulk_save_objects(stock_sentiments)
        db.commit()

        print("Stock sentiments updated successfully.")
    except Exception as e:
        db.rollback()
        print(f"Failed to update stock sentiments: {e}")
    finally:
        db.close()

In [29]:
add_to_db(base_articles_df)

Updating stock sentiments...
Stock sentiments updated successfully.


The second version of the algorithm using tokenization.

In [44]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))
stop_words = set(stopwords.words("english"))
articles_list = []

def delete_punctuation(article):
    pattern = r'[^\w\s]'
    article = re.sub(pattern,'',article)
    return article


def sentence_tokenization(sentence):
    article_word_list = nltk.tokenize.word_tokenize(sentence)
    return article_word_list


def article_tokenization(article):
    sentences = nltk.tokenize.sent_tokenize(article)
    return sentences


def delete_stop_words(article_list):
    global stop_words
    processed_article = [word for word in article_list if word.lower() not in stop_words]
    return processed_article


def analyze_tokenized_sentiment(tokenized_article):
    sentiment_scores = [token_analyzer.all_words(word) for word in tokenized_article if word.isalpha()]
    return sentiment_scores

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        articles_list = []
        for filename in os.listdir(folder_path):
            words_list = []
            sentences_list = []
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            article = re.sub(r'\.(?!\s)', '. ', article_text)
            sentences = article_tokenization(article_text)
            for sentence in sentences:
                sentence = delete_punctuation(sentence)
                tokenized_sentence = sentence_tokenization(sentence)
                without_base_sw = delete_stop_words(tokenized_sentence)
                words_list.append(without_base_sw)
            for word_list in words_list:
                sentence = ' '.join(word_list)
                
                sentence = sentence.capitalize()
                
                sentence = re.sub(r'([.!?])(\w)', r'\1 \2', sentence)
                sentences_list.append(sentence)
            articles_list.append(sentences_list)
            # if analyze['pos'] > analyze['neg']:
            #     pos_mark += 1
            # else:
            #     neg_mark += 1
        articles_marks.append({'stock': company_name, 'positives': pos_mark, 'negatives': neg_mark})
        break
print(articles_list)

[['Alibaba shares trading nearly 75 alltime highsalibaba walked back plans split company six standalone businessesalibaba stock trades 1144x forward earnings 10year historical pe 31as dominant ecommerce giant alibaba group holding nysebaba often considered amazon china', 'Sales thirdlargest ecommerce giant worldwith shares trading 36 yeartodate ytd also considerably underperforming nasdaq100 index 208 ytd', 'Shares trading nearly 75 alltime highs computer technology sector leader stock plenty room rebound', 'Four reasons alibaba stock may steal valuationsin 2023 alibaba announced would split business six separate entities enhance shareholder value', 'Several entities would standalone publicly traded companies shareholders would get piece new companies', 'Initially sent shares 10505 march 27 2023', 'However alibaba walk back many plansin november company squashed plans spin cloud divisions due uncertainties regarding us chip export restrictions', 'Instead company would focus growing clo