In [7]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

In [8]:
# Read the raw financial-news export into a DataFrame.
# Later cells read the 'title', 'description' and 'datetime' columns from it.
data_path = "financialNews.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [9]:
# Clean the articles in one chained pass:
#   1. keep only the first row for each distinct description,
#   2. keep descriptions that are at least 30 characters long
#      (missing descriptions compare False and are dropped here),
#   3. discard descriptions that start with "By" or "(Update".
# NOTE(review): the "By" prefix also matches words such as "Bypass" — confirm intended.
df = (
    df.drop_duplicates(subset='description', keep='first')
      .loc[lambda frame: frame['description'].str.len() >= 30]
      .loc[lambda frame: ~frame['description'].str.startswith(('By', '(Update'))]
)

In [None]:
# Number of rows remaining after cleaning.
df.shape[0]

In [11]:
# Combine title and description into a single text for sentiment analysis.
# A missing title would make `title + ". "` NaN and turn the whole full_text
# value into NaN, which the tokenizer cannot process; in that case fall back
# to the description alone. Rows with a title are built exactly as before.
title_prefix = (df['title'] + ". ").fillna('')
df['full_text'] = title_prefix + df['description']

In [12]:
# Load the FinBERT checkpoint: tokenizer and sequence-classification model
# both come from the same hub identifier.
model_name = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Ready-made pipeline wrapping the same model/tokenizer pair.
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

In [13]:
# Function to compute sentiment scores
def compute_sentiment_scores(text):
    """Return class probabilities for one text using the module-level model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The text to classify. Inputs longer than 512 tokens are
            truncated by the tokenizer.

    Returns:
        dict with keys "neutral", "positive" and "negative"; each value is
        the softmax probability (a NumPy float) taken from the first row of
        the model's logits.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time when applied row by row over the dataset.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.softmax(outputs.logits, dim=1)
    # .cpu() is a no-op on CPU and keeps .numpy() valid if the model is on a GPU.
    scores = probabilities.cpu().numpy()[0]
    # NOTE(review): index order 0/1/2 = neutral/positive/negative is assumed to
    # match this checkpoint's labels — confirm against model.config.id2label.
    return {
        "neutral": scores[0],
        "positive": scores[1],
        "negative": scores[2]
    }

In [None]:
# Register tqdm's pandas integration so Series gain `progress_apply`,
# then score every combined text while showing a progress bar.
tqdm.pandas(desc="Processing Sentiments")
scores_per_row = df['full_text'].progress_apply(compute_sentiment_scores)
df['sentiment_scores'] = scores_per_row

In [8]:
# Unpack each score dict into one column per sentiment class.
for label in ('neutral', 'positive', 'negative'):
    # Bind the key as a default argument so each lambda reads its own label.
    df[label] = df['sentiment_scores'].apply(lambda scores, key=label: scores[key])

In [None]:
# Preview the first five rows, now including the per-class score columns.
df.head(5)

In [14]:
# Parse the 'datetime' column into pandas datetimes so the `.dt` accessor
# can be used when grouping by calendar day.
parsed_timestamps = pd.to_datetime(df['datetime'])
df['datetime'] = parsed_timestamps



In [None]:
# Preview the first five rows after the datetime conversion.
df.head(5)

In [16]:
# Average each sentiment probability over all articles published on the same calendar day.
score_columns = ['neutral', 'positive', 'negative']
calendar_day = df['datetime'].dt.date
daily_sentiment = df.groupby(calendar_day)[score_columns].mean()

In [None]:
# Show the per-day averages; at this point the calendar day is still the index.
print(daily_sentiment)

In [18]:
# Turn the calendar-day index into a regular column named 'date'.
# The groupby key was built from df['datetime'], so the index is named
# 'datetime' and reset_index() alone never produces an 'index' column;
# renaming 'index' therefore did nothing. Naming the axis first gives the
# intended 'date' column.
# The guard keeps the cell safe to re-run once 'date' already exists.
if 'date' not in daily_sentiment.columns:
    daily_sentiment = daily_sentiment.rename_axis('date').reset_index()

In [None]:
# Show the per-day averages again after the index has been reset.
print(daily_sentiment)

In [20]:
# Save the daily sentiment scores to a CSV file.
# After reset_index() the day is an ordinary column and the index is just a
# 0..n-1 counter, so it is left out rather than written as an unnamed column.
output_path = 'daily_sentiment_scores_both.csv'
daily_sentiment.to_csv(output_path, index=False)

In [None]:
# Show the first five rows of the daily result as a final sanity check.
print(daily_sentiment.head(n=5))