In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

In [2]:
# Load the dataset
data_path = 'financialNews.csv'
df = pd.read_csv(data_path)

In [3]:
# Drop duplicate descriptions and filter out unwanted rows
df = df.drop_duplicates(subset='description', keep='first')
df = df[df['description'].str.len() >= 30]  # Keep descriptions with at least 30 characters
df = df[~df['description'].str.startswith(('By', '(Update'))]  # Exclude descriptions starting with "By" or "(Update"

In [4]:
len(df)

72165

In [5]:
# Initialize FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [6]:
# Function to compute sentiment scores
def compute_sentiment_scores(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1).detach().numpy()[0]
    return {
        "neutral": scores[0],
        "positive": scores[1],
        "negative": scores[2]
    }

In [7]:
# Apply sentiment analysis to each description with tqdm
tqdm.pandas(desc="Processing Sentiments")
df['sentiment_scores'] = df['description'].progress_apply(compute_sentiment_scores)

Processing Sentiments: 100%|██████████| 72165/72165 [2:24:39<00:00,  8.31it/s]  


In [8]:
# Extract sentiment components into separate columns
df['neutral'] = df['sentiment_scores'].apply(lambda x: x['neutral'])
df['positive'] = df['sentiment_scores'].apply(lambda x: x['positive'])
df['negative'] = df['sentiment_scores'].apply(lambda x: x['negative'])

In [13]:
df.head()

Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct,sentiment_scores,neutral,positive,negative
0,2023-11-24 00:00:00,OpenAI turmoil exposes threat to Microsoft’s i...,Microsoft chief executive Satya Nadella’s deci...,MSFT,Microsoft Corporation,Technology,Software—Infrastructure,-0.500163,"{'neutral': 1.2681854e-06, 'positive': 0.99999...",1.268185e-06,0.9999974,1.340741e-06
1,2023-11-23 19:00:00,10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.592448,"{'neutral': 0.9999901, 'positive': 3.301402e-0...",0.9999901,3.301402e-07,9.529862e-06
2,2023-11-23 17:43:00,UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.825589,"{'neutral': 2.7610524e-06, 'positive': 0.99999...",2.761052e-06,0.9999967,6.156512e-07
3,2023-11-23 16:47:00,Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers - General,0.323559,"{'neutral': 0.18199356, 'positive': 0.00074971...",0.1819936,0.0007497176,0.8172567
4,2023-11-23 15:25:00,EU mulls wider scope for cybersecurity certifi...,The European Union is considering broadening t...,GOOGL,Alphabet Inc.,Communication Services,Internet Content & Information,-1.4296,"{'neutral': 1.5444941e-07, 'positive': 0.99999...",1.544494e-07,0.9999999,4.542047e-08


In [14]:
# Group by date and calculate average sentiment scores for each day
df['datetime'] = pd.to_datetime(df['datetime'])



In [15]:
df.head()

Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct,sentiment_scores,neutral,positive,negative
0,2023-11-24 00:00:00,OpenAI turmoil exposes threat to Microsoft’s i...,Microsoft chief executive Satya Nadella’s deci...,MSFT,Microsoft Corporation,Technology,Software—Infrastructure,-0.500163,"{'neutral': 1.2681854e-06, 'positive': 0.99999...",1.268185e-06,0.9999974,1.340741e-06
1,2023-11-23 19:00:00,10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.592448,"{'neutral': 0.9999901, 'positive': 3.301402e-0...",0.9999901,3.301402e-07,9.529862e-06
2,2023-11-23 17:43:00,UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.825589,"{'neutral': 2.7610524e-06, 'positive': 0.99999...",2.761052e-06,0.9999967,6.156512e-07
3,2023-11-23 16:47:00,Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers - General,0.323559,"{'neutral': 0.18199356, 'positive': 0.00074971...",0.1819936,0.0007497176,0.8172567
4,2023-11-23 15:25:00,EU mulls wider scope for cybersecurity certifi...,The European Union is considering broadening t...,GOOGL,Alphabet Inc.,Communication Services,Internet Content & Information,-1.4296,"{'neutral': 1.5444941e-07, 'positive': 0.99999...",1.544494e-07,0.9999999,4.542047e-08


In [16]:
daily_sentiment = df.groupby(df['datetime'].dt.date)[['neutral', 'positive', 'negative']].mean()

In [17]:
print(daily_sentiment)

             neutral  positive  negative
datetime                                
2022-05-02  0.455935  0.202893  0.341171
2022-05-03  0.528152  0.319844  0.152004
2022-05-04  0.489388  0.370827  0.139785
2022-05-05  0.556980  0.285677  0.157343
2022-05-06  0.578342  0.254266  0.167392
...              ...       ...       ...
2023-11-20  0.597135  0.261245  0.141620
2023-11-21  0.429709  0.352886  0.217406
2023-11-22  0.392676  0.329357  0.277967
2023-11-23  0.539126  0.246439  0.214435
2023-11-24  0.000001  0.999997  0.000001

[562 rows x 3 columns]


In [18]:
# Reset the index to make 'date' a column
daily_sentiment = daily_sentiment.reset_index()
daily_sentiment.rename(columns={'index': 'date'}, inplace=True)

In [19]:
print(daily_sentiment)

       datetime   neutral  positive  negative
0    2022-05-02  0.455935  0.202893  0.341171
1    2022-05-03  0.528152  0.319844  0.152004
2    2022-05-04  0.489388  0.370827  0.139785
3    2022-05-05  0.556980  0.285677  0.157343
4    2022-05-06  0.578342  0.254266  0.167392
..          ...       ...       ...       ...
557  2023-11-20  0.597135  0.261245  0.141620
558  2023-11-21  0.429709  0.352886  0.217406
559  2023-11-22  0.392676  0.329357  0.277967
560  2023-11-23  0.539126  0.246439  0.214435
561  2023-11-24  0.000001  0.999997  0.000001

[562 rows x 4 columns]


In [20]:
# Save the daily sentiment scores to a CSV file
output_path = 'daily_sentiment_scores.csv'
daily_sentiment.to_csv(output_path, index=True)