In [5]:
from transformers import pipeline
import nltk
import pandas as pd
import numpy as np
import os

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\remes\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
summarizer = pipeline("summarization", truncation=True)
analyzer = SentimentIntensityAnalyzer()

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [7]:
def analyze_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment


def article_summarizer(article):
    summary = summarizer(article, max_length=300, min_length=100, do_sample=False)
    
    return summary[0]['summary_text']

def get_companies_names():
    db = next(get_db())
    try:
        stocks = db.query(Stock.title).all()
        return stocks
    finally:
        db.close()

In [23]:
articles_marks = []
companies_names = list((i[0] for i in get_companies_names()))

for root, dirs, files in os.walk(os.getcwd() + "/data/stocks"):
    for dir_name, company_name in zip(dirs, companies_names):
        folder_path = os.path.join(root, dir_name)
        pos_mark = 0
        neg_mark = 0
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            article_file = open(file_path, 'r', encoding="utf-8")
            article_text = article_file.read()
            sum_article = article_summarizer(article_text)
            analyze = analyze_sentiment(sum_article)
            if analyze['pos'] > analyze['neg']:
                pos_mark += 1
            else:
                neg_mark += 1
        articles_marks.append({'stock': company_name, 'positives': pos_mark, 'negatives': neg_mark})
        break
articles_df = pd.DataFrame(articles_marks, columns=['stock', 'positives', 'negatives'])
print(articles_df)

       stock  positives  negatives
0  Microsoft          2          4


In [28]:
from sqlalchemy import text
from database.models import Stock, StockSentiment
from database.db import get_db

def add_to_db(df):
    db = next(get_db())
    try:
        db.query(StockSentiment).delete()
        db.commit()

        db.execute(text("ALTER SEQUENCE stock_sentiments_id_seq RESTART WITH 1"))
        db.commit()
        
        print("Updating stock sentiments...")

        stock_sentiments = []
        for index, row in df.iterrows():
            stock = db.query(Stock).filter_by(title=row['stock']).first()
            if stock:
                stock_sentiment = StockSentiment(
                    stock_id=stock.id,
                    positives=row['positives'],
                    negatives=row['negatives']
                )
                stock_sentiments.append(stock_sentiment)
            else:
                print(f"Stock '{row['stock']}' not found in the Stock table")

        db.bulk_save_objects(stock_sentiments)
        db.commit()

        print("Stock sentiments updated successfully.")
    except Exception as e:
        db.rollback()
        print(f"Failed to update stock sentiments: {e}")
    finally:
        db.close()

In [29]:
add_to_db(articles_df)

Updating stock sentiments...
Stock sentiments updated successfully.
