<a href="https://colab.research.google.com/github/DMadhumita2904/BERT-Based-Sentiment-Analysis-for-Financial-News/blob/main/BERT_Based_Sentiment_Analysis_for_Financial_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas transformers torch scikit-learn




In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the dataset
file_path = '/content/data.csv'  # Replace with the actual path to your dataset
df = pd.read_csv(file_path)

# Drop rows that lack either the text or its label. pandas represents a
# missing cell as a float NaN, which is not a string and therefore cannot be
# passed through regex-based text cleaning, and a missing label cannot be
# used as a training target. On a file with no missing values this is a no-op.
df = df.dropna(subset=['Sentence', 'Sentiment']).reset_index(drop=True)

# Function to clean the text data
def clean_text(text):
    """Normalize a raw sentence to lowercase letters and whitespace only.

    URLs (anything starting with ``http`` up to the next whitespace) are
    removed first, then every character that is not an ASCII letter or
    whitespace is deleted, and finally the result is lowercased and stripped
    of leading/trailing whitespace. Interior runs of whitespace are kept
    as-is.

    Args:
        text: The raw sentence. Non-string values (for example ``None`` or
            the float NaN that pandas uses for a missing cell) are treated
            as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-strings, so a single missing cell would
    # otherwise abort a whole DataFrame.apply(clean_text) call.
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower().strip()

# Apply the clean_text function to the 'Sentence' column
df['Cleaned_Sentence'] = df['Sentence'].apply(clean_text)

# Split the data into training and test sets.
# stratify keeps the share of each sentiment class the same in both splits,
# so the evaluation set is not skewed when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Sentence'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['Sentiment'],
)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the labels.
# The encoder is fit on the full label column (labels are not model inputs,
# so this leaks nothing) to guarantee every class gets an id; fitting on the
# training labels alone makes transform(y_test) raise on an unseen class.
label_encoder = LabelEncoder()
label_encoder.fit(df['Sentiment'])
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Tokenize the text data (sequences longer than 128 tokens are truncated,
# shorter ones are padded to the longest sequence in the call)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

# Convert the data into PyTorch tensors
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (it must provide ``.items()``).
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field plus a ``'labels'`` entry.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = NewsDataset(train_encodings, y_train_encoded)
test_dataset = NewsDataset(test_encodings, y_test_encoded)

# Derive the label set from the fitted encoder instead of hard-coding 3, so
# the classification head always matches the classes present in the data.
# Names are cast to plain str so they can be written into the model config.
class_names = [str(name) for name in label_encoder.classes_]
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the BERT model for sequence classification.
# Storing id2label/label2id in the config keeps the class names with the
# saved checkpoint rather than only in this session's label_encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(class_names),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test set
trainer.evaluate()

# Save the model and tokenizer side by side so both can be reloaded from the
# same directory
model.save_pretrained('./sentiment-analysis-model')
tokenizer.save_pretrained('./sentiment-analysis-model')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3971,0.499512
2,0.4867,0.551287
3,0.2413,0.541037


('./sentiment-analysis-model/tokenizer_config.json',
 './sentiment-analysis-model/special_tokens_map.json',
 './sentiment-analysis-model/vocab.txt',
 './sentiment-analysis-model/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('./sentiment-analysis-model')
tokenizer = BertTokenizer.from_pretrained('./sentiment-analysis-model')

# Example sentence for sentiment prediction
sentence = "Tesla is recalling 2,700 Model X cars: https://t.co/8Z7BkVsTl9 $TSLA"

# Tokenize and predict.
# The model was fine-tuned on clean_text() output only, so the same cleaning
# is applied here; feeding raw text (URLs, digits, symbols) would give the
# model input unlike anything it saw during training.
inputs = tokenizer(clean_text(sentence), return_tensors="pt", truncation=True, padding=True, max_length=128)
# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Get predicted sentiment: the highest-scoring class id is mapped back to its
# original label with the encoder fitted during training
predicted_label = torch.argmax(outputs.logits, dim=1).item()
predicted_sentiment = label_encoder.inverse_transform([predicted_label])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")
