## Perform Sentiment Analysis

In [1]:
# !pip install transformers torch
# !pip install --upgrade ipywidgets ipykernel
# !pip install --upgrade torch
# !pip install numpy==1.24


In [2]:
# General Utilities
import time
import string
import re
import os
from collections import Counter, defaultdict
import math
import warnings

# Data Manipulation and Numerical Operations
import pandas as pd
import numpy as np

# Machine Learning - Modeling and Evaluation
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [3]:
# Load cleaned eight_k_sections_df from CSV
eight_k_sections_df = pd.read_csv('cleaned_eight_k_sections_df.csv')

In [4]:
# Preview the eight_k_sections_df
print("Eight-K Sections DataFrame:")
eight_k_sections_df.head()

Eight-K Sections DataFrame:


Unnamed: 0,ticker,disclosure_date,year,month,day,filing_type,filing_url,accession_number,section,content,Processed Disclosure Text,Token Stats
0,AAPL,2024-10-31,2024,10,31,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-24-000120,2.2,Item 2.02 Results of Operations and Financial ...,result operation financial condition. october ...,"{'Total Tokens': 49, 'Unique Tokens': 39, 'Lex..."
1,AAPL,2024-10-31,2024,10,31,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-24-000120,9.1,Item 9.01 Financial Statements and Exhibits. \...,financial statement exhibits. exhibits. exhibi...,"{'Total Tokens': 20, 'Unique Tokens': 18, 'Lex..."
2,AAPL,2024-09-10,2024,9,10,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-040659,7.1,Item 7.01 Regulation FD Disclosure.\n\nOn Augu...,regulation fd disclosure. august european anno...,"{'Total Tokens': 309, 'Unique Tokens': 194, 'L..."
3,AAPL,2024-08-26,2024,8,26,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-038601,5.2,Item 5.02 Departure of Directors or Certain Of...,departure director certain officer election di...,"{'Total Tokens': 66, 'Unique Tokens': 52, 'Lex..."
4,AAPL,2024-08-23,2024,8,23,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-038403,5.3,Item 5.03 Amendments to Articles of Incorporat...,amendment article incorporation bylaw change f...,"{'Total Tokens': 79, 'Unique Tokens': 64, 'Lex..."


In [5]:
# Suppress specific FutureWarnings if desired
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.load.*weights_only.*")
warnings.filterwarnings("ignore", category=FutureWarning, message=".*register_pytree_node.*")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Load the model directly using from_pretrained
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Define the sentiment analysis function
def analyze_finbert_sentiment(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment_score = predictions.detach().numpy()[0]
    return {
        'positive': sentiment_score[2],
        'neutral': sentiment_score[1],
        'negative': sentiment_score[0],
        'compound': sentiment_score[2] - sentiment_score[0]
    }

# Apply sentiment analysis to the DataFrame
eight_k_sections_df['8k_sentiment'] = eight_k_sections_df['Processed Disclosure Text'].apply(
    lambda x: analyze_finbert_sentiment(x, tokenizer, model) if isinstance(x, str) else None
)

# Extract individual sentiment scores
eight_k_sections_df['positive_sentiment'] = eight_k_sections_df['8k_sentiment'].apply(lambda x: x['positive'] if x else None)
eight_k_sections_df['neutral_sentiment'] = eight_k_sections_df['8k_sentiment'].apply(lambda x: x['neutral'] if x else None)
eight_k_sections_df['negative_sentiment'] = eight_k_sections_df['8k_sentiment'].apply(lambda x: x['negative'] if x else None)
eight_k_sections_df['compound_sentiment'] = eight_k_sections_df['8k_sentiment'].apply(lambda x: x['compound'] if x else None)

# Display the DataFrame
eight_k_sections_df.head()

Unnamed: 0,ticker,disclosure_date,year,month,day,filing_type,filing_url,accession_number,section,content,Processed Disclosure Text,Token Stats,8k_sentiment,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment
0,AAPL,2024-10-31,2024,10,31,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-24-000120,2.2,Item 2.02 Results of Operations and Financial ...,result operation financial condition. october ...,"{'Total Tokens': 49, 'Unique Tokens': 39, 'Lex...","{'positive': 0.94435006, 'neutral': 0.03616067...",0.94435,0.036161,0.019489,0.924861
1,AAPL,2024-10-31,2024,10,31,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0000320193-24-000120,9.1,Item 9.01 Financial Statements and Exhibits. \...,financial statement exhibits. exhibits. exhibi...,"{'Total Tokens': 20, 'Unique Tokens': 18, 'Lex...","{'positive': 0.9334852, 'neutral': 0.044672895...",0.933485,0.044673,0.021842,0.911643
2,AAPL,2024-09-10,2024,9,10,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-040659,7.1,Item 7.01 Regulation FD Disclosure.\n\nOn Augu...,regulation fd disclosure. august european anno...,"{'Total Tokens': 309, 'Unique Tokens': 194, 'L...","{'positive': 0.8427575, 'neutral': 0.10418083,...",0.842758,0.104181,0.053062,0.789696
3,AAPL,2024-08-26,2024,8,26,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-038601,5.2,Item 5.02 Departure of Directors or Certain Of...,departure director certain officer election di...,"{'Total Tokens': 66, 'Unique Tokens': 52, 'Lex...","{'positive': 0.9380812, 'neutral': 0.033893533...",0.938081,0.033894,0.028025,0.910056
4,AAPL,2024-08-23,2024,8,23,8-K,https://www.sec.gov/Archives/edgar/data/320193...,0001140361-24-038403,5.3,Item 5.03 Amendments to Articles of Incorporat...,amendment article incorporation bylaw change f...,"{'Total Tokens': 79, 'Unique Tokens': 64, 'Lex...","{'positive': 0.9407661, 'neutral': 0.03943291,...",0.940766,0.039433,0.019801,0.920965


In [6]:
# Group by ticker and disclosure date, then calculate the mean of sentiment scores
daily_sentiment_df = (
    eight_k_sections_df
    .groupby(['ticker', 'disclosure_date'])
    [['positive_sentiment', 'neutral_sentiment', 'negative_sentiment', 'compound_sentiment']]
    .mean()
    .reset_index()
)

# Display the resulting DataFrame
daily_sentiment_df

Unnamed: 0,ticker,disclosure_date,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment
0,AAPL,2023-02-02,0.939026,0.040366,0.020608,0.918418
1,AAPL,2023-03-10,0.900652,0.047103,0.052245,0.848408
2,AAPL,2023-05-04,0.938256,0.041305,0.020439,0.917817
3,AAPL,2023-05-10,0.937492,0.033279,0.029229,0.908264
4,AAPL,2023-08-03,0.940008,0.038987,0.021005,0.919004
...,...,...,...,...,...,...
561,XOM,2024-10-03,0.940580,0.037059,0.022361,0.918220
562,XOM,2024-10-08,0.943406,0.028649,0.027945,0.915461
563,XOM,2024-10-18,0.872158,0.106334,0.021508,0.850649
564,XOM,2024-11-01,0.920774,0.044801,0.034425,0.886348


In [None]:
# Save daily_sentiment_df to a CSV file
daily_sentiment_df.to_csv('daily_sentiment_df.csv', index=False)

# Confirm the file was saved successfully
print("Daily sentiment data saved to 'daily_sentiment_df.csv'")