# PT-BR Financial News Sentiment

1. Gather textual data
    - 1 - Test Valor Econômico texts
    - 2 - Test BDM texts
2. Define keywords and phrases
    - Automation: How can I automate the process of selecting what is considered relevant?
3. Text preprocessing (cleaning and preparing articles)
    - Normalize textual data
4. Filter articles
    - For each article, keep and evaluate RELEVANT SENTENCES ONLY
    - Output "irrelevant" for articles in which no sentence holds relevant information
5. Sentiment analysis 
    - Attempt a multi-class classification approach 
    - 5 categories planned (3 defined so far):
        - Good for USD
        - Good for BRL
        - Neutral
6. Trade signals
    - Buy USD/BRL
    - Sell USD/BRL
    - Hold

### Model 1 - BERT

In [9]:
from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from IPython.display import display

In [10]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification,
    pipeline,
)

# Load the tokenizer and the sequence-classification model registered under
# the "lucas-leme/FinBERT-PT-BR" identifier (resolved by `from_pretrained`).
finbert_pt_br_tokenizer = AutoTokenizer.from_pretrained("lucas-leme/FinBERT-PT-BR")
finbert_pt_br_model = BertForSequenceClassification.from_pretrained("lucas-leme/FinBERT-PT-BR")

# Bundle model and tokenizer into a text-classification pipeline; later cells
# call it with raw text and read the first prediction's 'label' and 'score'.
finbert_pt_br_pipeline = pipeline(task='text-classification', model=finbert_pt_br_model, tokenizer=finbert_pt_br_tokenizer)

In [11]:
def article_classification(directory, max_length=512):
    """Classify the sentiment of every file in ``directory`` with FinBERT-PT-BR.

    Each regular file is read as UTF-8 text and passed to the module-level
    ``finbert_pt_br_pipeline``. Truncation to ``max_length`` tokens is done
    by the pipeline's own tokenizer.

    Args:
        directory: Path of the folder holding the article files.
        max_length: Maximum number of tokens fed to the model per article;
            longer articles are cut off.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``file``, ``sentiment`` (predicted label) and ``score``. When the
        folder holds no files the frame is empty but keeps those columns.
    """
    results = []

    # Sort the entries so the row order is reproducible; os.listdir() returns
    # them in arbitrary order.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)

        # Sub-directories (and other non-file entries) cannot be open()ed.
        if not os.path.isfile(path):
            continue

        # NOTE(review): the sample files are named *.xml but their raw content
        # is classified as-is — confirm whether markup should be stripped first.
        with open(path, 'r', encoding='utf-8') as fhand:
            article = fhand.read()

        # Let the pipeline truncate while tokenizing. The previous
        # encode -> decode -> re-tokenize round trip did not guarantee that
        # the re-tokenized text stayed within max_length, and decoding with
        # skip_special_tokens=True silently dropped unknown-token markers.
        sentiment = finbert_pt_br_pipeline(
            article, truncation=True, max_length=max_length
        )

        results.append({
            'file': filename,
            'sentiment': sentiment[0]['label'],
            'score': sentiment[0]['score'],
        })

    # Explicit columns keep the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=['file', 'sentiment', 'score'])

# Classify every file in the sample folder and print the resulting table
print(article_classification('News_Sample/andre'))

          file sentiment     score
0    File1.xml  NEGATIVE  0.791573
1   File10.xml  NEGATIVE  0.823842
2   File11.xml  POSITIVE  0.827733
3   File12.xml  NEGATIVE  0.780306
4   File13.xml   NEUTRAL  0.671244
5   File14.xml  POSITIVE  0.555569
6   File15.xml   NEUTRAL  0.551850
7   File16.xml  NEGATIVE  0.685905
8   File17.xml  POSITIVE  0.439669
9   File18.xml  NEGATIVE  0.794118
10  File19.xml  NEGATIVE  0.528298
11   File2.xml   NEUTRAL  0.536406
12   File3.xml  POSITIVE  0.371683
13   File4.xml  NEGATIVE  0.750617
14   File5.xml  POSITIVE  0.593563
15   File6.xml  POSITIVE  0.451566
16   File7.xml  NEGATIVE  0.720258
17   File8.xml  NEGATIVE  0.831076
18   File9.xml  NEGATIVE  0.578087


In [9]:
def clean_file(inputFile, outputFile):
    """Strip bracket/ellipsis noise from a corpus and separate dated sections.

    Every line of ``inputFile`` has the characters ``[``, ``]`` and ``…``
    removed and its surrounding whitespace stripped; lines left empty are
    dropped. A blank line is written before each date header (a line with
    ``/`` at positions 2 and 5, as in ``dd/mm/yy``), except when the header
    is the first line of the output.

    Args:
        inputFile: Path of the raw UTF-8 text file to read.
        outputFile: Path of the UTF-8 text file to write (overwritten).
    """
    cleaned_lines = []

    with open(inputFile, 'r', encoding='utf-8') as file:
        for line in file:
            cleaned_line = line.replace('[', '').replace(']', '').replace('…', '').strip()
            if not cleaned_line:
                continue

            # The length guard matters: indexing [2] / [5] on a shorter line
            # (e.g. "Ok") used to raise IndexError.
            is_date_header = (
                len(cleaned_line) > 5
                and cleaned_line[2] == '/'
                and cleaned_line[5] == '/'
            )
            # Separate consecutive dated sections with one blank line.
            if is_date_header and cleaned_lines:
                cleaned_lines.append('')

            cleaned_lines.append(cleaned_line)

    with open(outputFile, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in cleaned_lines)

# Clean the raw BDM corpus; the output file is what the parsing cell reads
clean_file('BDM_News_Corpus.txt', 'Clean_BDM_News_Corpus.txt')

### Model 3 - Word2Vec

In [None]:
#preprocessing and cleaning

import pandas as pd
import re
from datetime import datetime

# Function to parse the text file and return a Pandas DataFrame with date format conversion
def parse_articles_to_df(file_path):
    """Parse a cleaned corpus file into a DataFrame of dated articles.

    The file is read line by line. A non-empty line whose third character
    is ``/`` is treated as a ``DD/MM/YY`` date header and converted with
    ``convert_brazilian_to_american_date``. Every other non-empty line is
    one article belonging to the most recent header. Lines that appear
    before the first header are discarded.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date`` (``MM/DD/YY``
        string) and ``article`` (the line text), one row per article, in
        file order.

    Raises:
        ValueError: If a date header does not match ``DD/MM/YY`` exactly
            (raised by ``convert_brazilian_to_american_date``).
    """
    dates = []
    articles = []
    current_date = None

    with open(file_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:  # Ignore empty lines
                continue

            # Date headers start with dd/mm/yy. The length guard keeps one-
            # and two-character lines from raising IndexError on line[2].
            if len(line) > 2 and line[2] == "/":
                current_date = convert_brazilian_to_american_date(line)
            elif current_date is not None:
                # Record the article under the date currently in effect.
                dates.append(current_date)
                articles.append(line)

    return pd.DataFrame({'date': dates, 'article': articles})

# Function to convert Brazilian date format (DD/MM/YY) to American format (MM/DD/YY)
def convert_brazilian_to_american_date(date_str):
    """Re-express a ``DD/MM/YY`` date string in ``MM/DD/YY`` order.

    The input is parsed with ``datetime.strptime`` and must match
    ``%d/%m/%y`` exactly; any other input raises ``ValueError``.
    """
    brazilian_format = "%d/%m/%y"
    american_format = "%m/%d/%y"
    return datetime.strptime(date_str, brazilian_format).strftime(american_format)

# Example usage
# Parse the cleaned corpus file into one row per article
file_path = "Clean_BDM_News_Corpus.txt"
df_articles = parse_articles_to_df(file_path)

# Preview the first rows (columns: 'date' in MM/DD/YY, 'article' text)
display(df_articles.head())

Unnamed: 0,date,article
0,01/09/24,"O petróleo testava reação moderada (+0,50%) no..."
1,01/09/24,Circularam comentários de que a reunião de Pac...
2,01/09/24,"De qualquer modo, seis senadores estão com a p..."
3,01/09/24,"Nos EUA, sai a balança comercial de novembro (..."
4,01/09/24,"O investidor cumpre a espera pela 5ªF, que pro..."


In [22]:
import spacy

# Load spaCy's small Portuguese pipeline. The package must be installed
# beforehand (e.g. `python -m spacy download pt_core_news_sm`).
nlp = spacy.load('pt_core_news_sm')

# Preprocessing function using spaCy
def preprocess_text_spacy(text):
    """Return ``text`` reduced to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic are discarded; the
    lemma of every remaining token is kept, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip stop words as well as numbers, punctuation and mixed tokens.
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)

    return ' '.join(lemmas)

# Lemmatize and filter every article; results go in a new 'processed_article'
# column (one spaCy call per row)
df_articles['processed_article'] = df_articles['article'].apply(preprocess_text_spacy)

# Preview the processed text next to its date
display(df_articles[['date', 'processed_article']].head())

Unnamed: 0,date,processed_article
0,01/09/24,petróleo testar reação moderar pregão asiático...
1,01/09/24,circularam comentário reunião Pacheco líder se...
2,01/09/24,modo senador presença confirmar Único indicado...
3,01/09/24,EUA sair balança comercial novembro Fed boy Mi...
4,01/09/24,investidor cumprir espera prometer emoção CPI ...


In [23]:
from gensim.models import Word2Vec

# Split each processed article on whitespace into a list of tokens; the
# result is a list of token lists, one per article
tokenized_articles = df_articles['processed_article'].apply(lambda x: x.split()).tolist()

# Train Word2Vec model
# NOTE(review): no `seed` is passed and workers=4, so repeated runs are not
# expected to produce identical embeddings — confirm whether reproducibility
# matters for the downstream labels.
model = Word2Vec(sentences=tokenized_articles, 
                 vector_size=100,   # Dimensionality of the word embeddings
                 window=5,          # Context window size
                 min_count=5,       # Words seen fewer than 5 times are ignored
                 workers=4,         # Number of threads for training
                 sg=0)              # 0 selects CBOW; 1 would select Skip-Gram

# Save the model for later use
model.save("word2vec_brl_model.model")

In [27]:
import numpy as np

# Function to generate an article vector by averaging the word vectors
def get_article_vector(article, model):
    """Embed an article as the mean of its known word vectors.

    ``article`` is split on whitespace. Tokens absent from ``model.wv`` are
    skipped. When no token is known, a zero vector of length
    ``model.vector_size`` is returned instead of a mean.
    """
    known_vectors = [
        model.wv[word] for word in article.split() if word in model.wv
    ]

    # Nothing to average: fall back to the all-zero embedding.
    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# Compute one averaged embedding per processed article and store it in a new
# 'article_vector' column (each cell holds a NumPy array)
df_articles['article_vector'] = df_articles['processed_article'].apply(lambda x: get_article_vector(x, model))

# Export the date and vector columns to CSV (no index column)
# NOTE(review): each vector cell is a NumPy array, so the CSV will presumably
# contain its text representation rather than numeric columns — confirm that
# whatever reads 'article_vectors.csv' can parse that back.
article_vectors = df_articles[['date', 'article_vector']]
article_vectors.to_csv('article_vectors.csv', index=False)

In [None]:
# Hand-written labels keyed by date string in MM/DD/YY, the same format used
# in df_articles['date']
# NOTE(review): presumably the direction refers to the USD/BRL move — confirm.
labels = {
    "01/09/24": 1,  # +1: up
    "01/10/24": -1, # -1: down
    "01/15/24": 0   # 0: no clear direction
}

# Attach the label of each row's date; dates missing from `labels` get NaN
df_articles['label'] = df_articles['date'].map(labels)

# Preview the labelled rows
print(df_articles[['date', 'article', 'label']].head())