In [18]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

In [9]:
# Read the CSV dataset from disk into a DataFrame
file_path = 'financialNews.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [10]:
# Step 1: Keep only the first row for each distinct description
df = df[~df.duplicated(subset='description', keep='first')]

In [11]:
# Step 2: Filter descriptions
# na=False makes a missing description evaluate to False instead of NaN, so each
# mask stays a plain boolean Series that can be inverted with `~` and used for
# indexing. Rows with a missing description are removed by the length filter.
df = df[~df['description'].str.startswith("By", na=False)]  # Exclude rows where description starts with "By"
df = df[~df['description'].str.startswith("(Update", na=False)]  # Exclude rows where description starts with "(Update"
df = df[df['description'].str.len() >= 30]  # Exclude rows with descriptions shorter than 30 characters

In [12]:
# Step 3: Replace the index left over from filtering with a fresh 0..n-1 range
df = df.reset_index(drop=True)

In [14]:
# Step 4: Build the sentiment-analysis pipeline; the model and its tokenizer
# are both loaded from the "yiyanghkust/finbert-tone" checkpoint.
finbert = pipeline(
    task="sentiment-analysis",
    model="yiyanghkust/finbert-tone",
    tokenizer="yiyanghkust/finbert-tone",
)

In [23]:
# Step 5: Apply sentiment analysis in batches (modified)
def classify_sentiment_batch(descriptions, classifier=None, max_length=512):
    """Return the predicted sentiment label for each description in a batch.

    Parameters
    ----------
    descriptions : iterable
        Texts to classify. The iterable is materialised into a list before
        being handed to the classifier.
    classifier : callable, optional
        Called as ``classifier(texts, truncation=True, max_length=max_length)``
        and expected to return one mapping with a ``'label'`` key per text.
        When omitted, the notebook-level ``finbert`` pipeline is used.
    max_length : int, optional
        Length limit forwarded to the classifier together with
        ``truncation=True``. Defaults to 512.

    Returns
    -------
    list
        The ``'label'`` entry of every result, in the order the classifier
        returned them. An empty input yields an empty list.
    """
    texts = list(descriptions)
    if not texts:
        # Nothing to classify: skip the model call entirely.
        return []
    # Look up the global lazily so an injected classifier works even when
    # `finbert` has not been created in this kernel.
    model = finbert if classifier is None else classifier
    results = model(texts, truncation=True, max_length=max_length)
    return [result['label'] for result in results]

In [24]:
# Number of descriptions handed to the classifier per call;
# tune this to the available memory and the desired throughput.
batch_size = 16

In [25]:
# Apply batch processing
sentiments = []
for i in tqdm(range(0, len(df), batch_size), desc="Processing Batches"):
    # .iloc slices strictly by position, so batching does not depend on the
    # index labels the frame happens to carry.
    batch = df['description'].iloc[i:i+batch_size]
    sentiments.extend(classify_sentiment_batch(batch))

# Every row must have received exactly one label before the list is attached
# to the frame as a column.
assert len(sentiments) == len(df), "sentiment count does not match row count"

Processing Batches: 100%|██████████| 4511/4511 [6:25:19<00:00,  5.13s/it]


In [26]:
# Attach the collected labels to the frame as a new 'sentiment' column
df = df.assign(sentiment=sentiments)

In [27]:
# Step 6: Write the labelled rows to CSV, leaving the row index out of the file
output_file = 'sentiment_analysis_results.csv'
df.to_csv(path_or_buf=output_file, index=False)

# Report where the results were written
print(f"Sentiment analysis completed. Results saved to {output_file}")

Sentiment analysis completed. Results saved to sentiment_analysis_results.csv
