In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

In [2]:
# Load the dataset
data_path = 'financialNews.csv'
df = pd.read_csv(data_path)

In [3]:
# Drop duplicate descriptions and filter out unwanted rows
df = df.drop_duplicates(subset='description', keep='first')
df = df[df['description'].str.len() >= 30]  # Keep descriptions with at least 30 characters
df = df[~df['description'].str.startswith(('By', '(Update'))]  # Exclude descriptions starting with "By" or "(Update"

In [4]:
len(df)

72165

In [5]:
# Combine title and description into a single text for sentiment analysis
df['full_text'] = df['title'] + ". " + df['description']

In [6]:
# Initialize FinBERT model and tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [7]:
# Function to compute sentiment scores
def compute_sentiment_scores(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1).detach().numpy()[0]
    return {
        "neutral": scores[0],
        "positive": scores[1],
        "negative": scores[2]
    }

In [8]:
# Apply sentiment analysis to each description with tqdm
tqdm.pandas(desc="Processing Sentiments")
df['sentiment_scores'] = df['full_text'].progress_apply(compute_sentiment_scores)

Processing Sentiments: 100%|██████████| 72165/72165 [2:40:28<00:00,  7.50it/s]  


In [9]:
# Extract sentiment components into separate columns
df['neutral'] = df['sentiment_scores'].apply(lambda x: x['neutral'])
df['positive'] = df['sentiment_scores'].apply(lambda x: x['positive'])
df['negative'] = df['sentiment_scores'].apply(lambda x: x['negative'])

In [10]:
df.head()

Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct,full_text,sentiment_scores,neutral,positive,negative
0,"Fri 24 Nov 2023, 12:00AM",OpenAI turmoil exposes threat to Microsoft’s i...,Microsoft chief executive Satya Nadella’s deci...,MSFT,Microsoft Corporation,Technology,Software—Infrastructure,-0.500163,OpenAI turmoil exposes threat to Microsoft’s i...,"{'neutral': 0.00075857545, 'positive': 0.99884...",0.0007585755,0.998844,0.0003971171
1,"Thu 23 Nov 2023, 07:00PM",10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.592448,10 Can’t Miss Black Friday Electronics Deals a...,"{'neutral': 0.9996842, 'positive': 2.8879558e-...",0.9996842,2.9e-05,0.000286953
2,"Thu 23 Nov 2023, 05:43PM",UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.825589,UPDATE 1-German union Verdi calls for strikes ...,"{'neutral': 1.4565956e-05, 'positive': 0.99998...",1.456596e-05,0.999985,5.540053e-07
3,"Thu 23 Nov 2023, 04:47PM",Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers - General,0.323559,Corrections & Amplifications - The success of ...,"{'neutral': 0.6926845, 'positive': 0.029702397...",0.6926845,0.029702,0.2776131
4,"Thu 23 Nov 2023, 03:25PM",EU mulls wider scope for cybersecurity certifi...,The European Union is considering broadening t...,GOOGL,Alphabet Inc.,Communication Services,Internet Content & Information,-1.4296,EU mulls wider scope for cybersecurity certifi...,"{'neutral': 6.0717076e-07, 'positive': 0.99999...",6.071708e-07,0.999999,6.300474e-08


In [11]:
# Group by date and calculate average sentiment scores for each day
df['datetime'] = pd.to_datetime(df['datetime'])



  df['datetime'] = pd.to_datetime(df['datetime'])


In [12]:
df.head()

Unnamed: 0,datetime,title,description,ticker,company,sector,industry,change_pct,full_text,sentiment_scores,neutral,positive,negative
0,2023-11-24 00:00:00,OpenAI turmoil exposes threat to Microsoft’s i...,Microsoft chief executive Satya Nadella’s deci...,MSFT,Microsoft Corporation,Technology,Software—Infrastructure,-0.500163,OpenAI turmoil exposes threat to Microsoft’s i...,"{'neutral': 0.00075857545, 'positive': 0.99884...",0.0007585755,0.998844,0.0003971171
1,2023-11-23 19:00:00,10 Can’t Miss Black Friday Electronics Deals a...,The biggest shopping day of the season is upon...,COST,Costco Wholesale Corporation,Consumer Defensive,Discount Stores,0.592448,10 Can’t Miss Black Friday Electronics Deals a...,"{'neutral': 0.9996842, 'positive': 2.8879558e-...",0.9996842,2.9e-05,0.000286953
2,2023-11-23 17:43:00,UPDATE 1-German union Verdi calls for strikes ...,German trade union Verdi has called on members...,AMZN,"Amazon.com, Inc.",Consumer Cyclical,Internet Retail,-0.825589,UPDATE 1-German union Verdi calls for strikes ...,"{'neutral': 1.4565956e-05, 'positive': 0.99998...",1.456596e-05,0.999985,5.540053e-07
3,2023-11-23 16:47:00,Corrections & Amplifications - The success of ...,The success of blood thinners being developed ...,BMY,Bristol-Myers Squibb Company,Healthcare,Drug Manufacturers - General,0.323559,Corrections & Amplifications - The success of ...,"{'neutral': 0.6926845, 'positive': 0.029702397...",0.6926845,0.029702,0.2776131
4,2023-11-23 15:25:00,EU mulls wider scope for cybersecurity certifi...,The European Union is considering broadening t...,GOOGL,Alphabet Inc.,Communication Services,Internet Content & Information,-1.4296,EU mulls wider scope for cybersecurity certifi...,"{'neutral': 6.0717076e-07, 'positive': 0.99999...",6.071708e-07,0.999999,6.300474e-08


In [13]:
daily_sentiment = df.groupby(df['datetime'].dt.date)[['neutral', 'positive', 'negative']].mean()

In [14]:
print(daily_sentiment)

             neutral  positive  negative
datetime                                
2022-05-02  0.424826  0.219777  0.355397
2022-05-03  0.501537  0.310365  0.188098
2022-05-04  0.474070  0.376741  0.149190
2022-05-05  0.519249  0.297318  0.183434
2022-05-06  0.474621  0.327590  0.197788
...              ...       ...       ...
2023-11-20  0.513478  0.343111  0.143411
2023-11-21  0.431269  0.343094  0.225637
2023-11-22  0.343652  0.375047  0.281301
2023-11-23  0.529288  0.245217  0.225494
2023-11-24  0.000759  0.998844  0.000397

[562 rows x 3 columns]


In [15]:
# Reset the index to make 'date' a column
daily_sentiment = daily_sentiment.reset_index()
daily_sentiment.rename(columns={'index': 'date'}, inplace=True)

In [16]:
print(daily_sentiment)

       datetime   neutral  positive  negative
0    2022-05-02  0.424826  0.219777  0.355397
1    2022-05-03  0.501537  0.310365  0.188098
2    2022-05-04  0.474070  0.376741  0.149190
3    2022-05-05  0.519249  0.297318  0.183434
4    2022-05-06  0.474621  0.327590  0.197788
..          ...       ...       ...       ...
557  2023-11-20  0.513478  0.343111  0.143411
558  2023-11-21  0.431269  0.343094  0.225637
559  2023-11-22  0.343652  0.375047  0.281301
560  2023-11-23  0.529288  0.245217  0.225494
561  2023-11-24  0.000759  0.998844  0.000397

[562 rows x 4 columns]


In [17]:
# Save the daily sentiment scores to a CSV file
output_path = 'daily_sentiment_scores_both.csv'
daily_sentiment.to_csv(output_path, index=True)

In [18]:
# Print the first few rows of the result
print(daily_sentiment.head())

     datetime   neutral  positive  negative
0  2022-05-02  0.424826  0.219777  0.355397
1  2022-05-03  0.501537  0.310365  0.188098
2  2022-05-04  0.474070  0.376741  0.149190
3  2022-05-05  0.519249  0.297318  0.183434
4  2022-05-06  0.474621  0.327590  0.197788
