# PT-BR Financial News Sentiment

1. Gather textual data
    - 1 - Test Valor Economico texts 
    - 2 - Test BDM texts 
2. Define keywords and phrases
    - Automation: How can I automate the process of selecting what is considered relevant?
3. Text preprocessing (cleaning and preparing articles)
    - Normalize textual data
4. Filter articles
    - Perform on each article: evaluate for RELEVANT SENTENCES ONLY
    - Output "irrelevant" for an article when none of its sentences hold relevant information
5. Sentiment analysis 
    - Attempt a multi-class classification approach 
    - 3 categories:
        - Good for USD
        - Good for BRL
        - Neutral
6. Trade signals
    - Buy USD/BRL
    - Sell USD/BRL
    - Hold

### Model 1 - BERT

In [49]:
from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from IPython.display import display

In [50]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification,
    pipeline,
)

# Checkpoint id shared by the tokenizer and the model.
_FINBERT_PT_BR_ID = "lucas-leme/FinBERT-PT-BR"

finbert_pt_br_tokenizer = AutoTokenizer.from_pretrained(_FINBERT_PT_BR_ID)
finbert_pt_br_model = BertForSequenceClassification.from_pretrained(_FINBERT_PT_BR_ID)

# Bundle model and tokenizer into a callable text-classification pipeline.
finbert_pt_br_pipeline = pipeline(
    task='text-classification',
    model=finbert_pt_br_model,
    tokenizer=finbert_pt_br_tokenizer,
)



In [51]:
def article_classification(directory, max_length=512):
    """Classify every file in *directory* with the FinBERT-PT-BR pipeline.

    Each regular file is read as UTF-8 text and passed to
    ``finbert_pt_br_pipeline``. Truncation to ``max_length`` tokens is
    delegated to the pipeline's tokenizer, so the text the model sees is the
    original text (not a decoded/re-tokenised copy) and can never exceed the
    limit.

    Args:
        directory: Path of the folder holding the article files.
        max_length: Maximum number of tokens (special tokens included) fed to
            the model per article.

    Returns:
        A ``pandas.DataFrame`` with the columns ``file``, ``sentiment`` and
        ``score``, one row per classified file, ordered by file name. The
        frame is empty (but keeps its columns) when nothing was classified.
    """
    results = []

    # os.listdir order is arbitrary; sort for a reproducible table.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)

        # Sub-directories and other non-files cannot be opened as text.
        if not os.path.isfile(path):
            continue

        with open(path, 'r', encoding='utf-8') as fhand:
            article = fhand.read()

        # Extra keyword arguments are forwarded to the tokenizer.
        sentiment = finbert_pt_br_pipeline(
            article, truncation=True, max_length=max_length
        )

        results.append({
            'file': os.path.basename(filename),
            'sentiment': sentiment[0]['label'],
            'score': sentiment[0]['score'],
        })

    return pd.DataFrame(results, columns=['file', 'sentiment', 'score'])

# Classify every file in the sample directory and print the resulting table.
print(article_classification('News_Sample/andre'))

          file sentiment     score
0    File1.xml  NEGATIVE  0.791573
1   File10.xml  NEGATIVE  0.823842
2   File11.xml  POSITIVE  0.827733
3   File12.xml  NEGATIVE  0.780306
4   File13.xml   NEUTRAL  0.671244
5   File14.xml  POSITIVE  0.555569
6   File15.xml   NEUTRAL  0.551850
7   File16.xml  NEGATIVE  0.685905
8   File17.xml  POSITIVE  0.439669
9   File18.xml  NEGATIVE  0.794118
10  File19.xml  NEGATIVE  0.528298
11   File2.xml   NEUTRAL  0.536406
12   File3.xml  POSITIVE  0.371683
13   File4.xml  NEGATIVE  0.750617
14   File5.xml  POSITIVE  0.593563
15   File6.xml  POSITIVE  0.451566
16   File7.xml  NEGATIVE  0.720258
17   File8.xml  NEGATIVE  0.831076
18   File9.xml  NEGATIVE  0.578087


In [52]:
def clean_file(inputFile, outputFile):
    """Strip bracket/ellipsis noise from a corpus and separate date sections.

    Every line of *inputFile* has ``[``, ``]`` and ``…`` removed and is
    stripped of surrounding whitespace; lines that end up empty are dropped.
    A line is treated as a date header when it has ``/`` at positions 2 and 5
    (the ``dd/mm/yy`` layout). A single blank line is written before each
    date header except the first line of the output.

    Args:
        inputFile: Path of the raw UTF-8 text file.
        outputFile: Path the cleaned UTF-8 text is written to.
    """
    cleaned_lines = []

    with open(inputFile, 'r', encoding='utf-8') as file:
        for line in file:
            cleaned_line = line.replace('[', '').replace(']', '').replace('…', '').strip()
            if not cleaned_line:
                continue

            # The length guard keeps short lines (e.g. "OK") from raising
            # IndexError when the slash positions are inspected.
            is_date_header = (
                len(cleaned_line) > 5
                and cleaned_line[2] == '/'
                and cleaned_line[5] == '/'
            )
            if is_date_header and cleaned_lines:
                cleaned_lines.append('')

            cleaned_lines.append(cleaned_line)

    with open(outputFile, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in cleaned_lines)

# Clean the raw BDM corpus and write the result to a new file.
clean_file('BDM_News_Corpus.txt', 'Clean_BDM_News_Corpus.txt')

### Model 3 - Word2Vec

In [53]:
'''
Plan: Add column for vector label (+1 good for Real, -1 bad for real, 0 meh)
1) when we need the dates we will have them
2) we will also have info in this prospective second column on whether the currency went up or down for each day in the file
3) we can automate this possibly with yahoo finance
we refers to me

issue: technically the labels in some code further below are in a dictionary so i'll have to figure that out
'''

import re
import pandas as pd

def extract_dates_from_file(file_path):
    """Collect every ``dd/dd/dd`` digit pattern found in a text file.

    The whole file is scanned, so matches inside running text are returned
    too, in order of appearance and with duplicates kept.

    Args:
        file_path: Path of the UTF-8 text file to scan.

    Returns:
        A ``pandas.DataFrame`` with a single ``Date`` column holding the
        matched strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    found = [hit.group(0) for hit in re.finditer(r'\d{2}/\d{2}/\d{2}', text)]
    return pd.DataFrame(found, columns=['Date'])

# Extract every dd/mm/yy match from the cleaned corpus and show the table.
file_path = 'Clean_BDM_News_Corpus.txt'
dates_df = extract_dates_from_file(file_path)
display(dates_df)

Unnamed: 0,Date
0,09/01/24
1,10/01/24
2,11/01/24
3,12/01/24
4,15/01/24
5,16/01/24
6,19/01/24
7,22/01/24
8,23/01/24
9,24/01/24


In [54]:
import pandas as pd
import re
from datetime import datetime

# Parse the date-delimited corpus into a (date, article) table.
def parse_articles_to_df(file_path):
    """Read a date-delimited corpus and return one row per article line.

    A line that starts with a ``dd/mm/yy`` pattern opens a new date section;
    the whole stripped line is kept as the date value. Every other non-empty
    line is an article belonging to the most recent date. Lines that appear
    before the first date header are ignored.

    Args:
        file_path: Path of the UTF-8 corpus file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date`` and ``article``, in
        file order.
    """
    # Matching on the pattern (instead of probing line[2]) is safe for lines
    # shorter than three characters and does not mistake article text that
    # merely has "/" as its third character for a date header.
    date_header = re.compile(r'\d{2}/\d{2}/\d{2}')

    dates = []
    articles = []
    current_date = None  # date section currently being read

    with open(file_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            if date_header.match(line):
                current_date = line
            elif current_date is not None:
                dates.append(current_date)
                articles.append(line)

    return pd.DataFrame({'date': dates, 'article': articles})

file_path = "Clean_BDM_News_Corpus.txt"  # cleaned corpus written by clean_file
df_articles = parse_articles_to_df(file_path)  # one row per (date, article line)

# Render the table in the notebook.
display(df_articles)

Unnamed: 0,date,article
0,09/01/24,"O petróleo testava reação moderada (+0,50%) no..."
1,09/01/24,Circularam comentários de que a reunião de Pac...
2,09/01/24,"De qualquer modo, seis senadores estão com a p..."
3,09/01/24,"Nos EUA, sai a balança comercial de novembro (..."
4,09/01/24,"O investidor cumpre a espera pela 5ªF, que pro..."
...,...,...
1097,31/01/24,Emissão é de apenas uma série e já tem valor d...
1098,31/01/24,"ROMI teve lucro líquido de R$ 51,340 milhões n..."
1099,31/01/24,ENEVA. Citi manteve recomendação de compra par...
1100,31/01/24,OI. Nova versão do plano de recuperação judici...


In [55]:
import spacy

# Load the spaCy Portuguese model used by the preprocessing step.
nlp = spacy.load('pt_core_news_sm')

#preprocessing
def preprocess_text_spacy(text):
    """Return the lemmas of *text*, without stop words, as one string.

    The text is run through the module-level spaCy pipeline ``nlp``. Only
    purely alphabetic tokens that are not stop words are kept; their lemmas
    are joined with single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)

    return ' '.join(lemmas)

# Preprocess every article in df_articles and store the result in a new column.
df_articles['processed_article'] = df_articles['article'].apply(preprocess_text_spacy)

# Show the first rows of the processed articles alongside their dates.
display(df_articles[['date', 'processed_article']].head())

Unnamed: 0,date,processed_article
0,09/01/24,petróleo testar reação moderar pregão asiático...
1,09/01/24,circularam comentário reunião Pacheco líder se...
2,09/01/24,modo senador presença confirmar Único indicado...
3,09/01/24,EUA sair balança comercial novembro Fed boy Mi...
4,09/01/24,investidor cumprir espera prometer emoção CPI ...


In [56]:
from gensim.models import Word2Vec

# Split each preprocessed article on whitespace: one token list per article.
tokenized_articles = [text.split() for text in df_articles['processed_article']]

# Train the word embeddings on the tokenised corpus.
model = Word2Vec(
    sentences=tokenized_articles,
    vector_size=100,  # dimensionality of the word embeddings
    window=5,         # context window size
    min_count=5,      # minimum frequency of words to consider
    workers=4,        # worker threads used for training
    sg=0,             # 0 = CBOW, 1 = Skip-Gram
)

model.save("word2vec_brl_model.model")

In [57]:
import numpy as np

# generating article vectors based on average
def get_article_vector(article, model):
    """Average the word vectors of the tokens in *article*.

    The article is split on whitespace; tokens missing from ``model.wv`` are
    skipped.

    Args:
        article: Whitespace-separated token string.
        model: Object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    known = [model.wv[word] for word in article.split() if word in model.wv]

    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Compute one averaged embedding per article and store it in a new column.
df_articles['article_vector'] = df_articles['processed_article'].apply(lambda x: get_article_vector(x, model))

# Keep only the date and vector columns for export.
article_vectors = df_articles[['date', 'article_vector']]

# Optional CSV export for easier inspection.
# NOTE(review): each vector is written as its string representation, so this
# file is presumably not meant to be loaded back as numeric data — confirm.
article_vectors.to_csv('article_vectors.csv', index=False)

display(article_vectors.head())

Unnamed: 0,date,article_vector
0,09/01/24,"[-0.010041372, 0.027553359, 0.0056391084, 0.00..."
1,09/01/24,"[-0.0124345375, 0.0380081, 0.004453513, 0.0023..."
2,09/01/24,"[-0.008635919, 0.04322502, 0.009430615, -0.001..."
3,09/01/24,"[-0.015746552, 0.05247231, 0.0067171515, 0.005..."
4,09/01/24,"[-0.02147046, 0.066287816, 0.011573673, 0.0063..."


In [58]:
'''
0 for meh
+1 for up
-1 for down
'''
     
# Hand-assigned label per date (dd/mm/yy): +1 up, -1 down, 0 little change.
# NOTE(review): the labels do not say which side of USD/BRL "up" refers to — confirm.
labels = {
    '09/01/24': 1, # 0.8% change
    '10/01/24': 0, # 0.2% change
    '11/01/24': -1, # 0.6% change
    '12/01/24': -1, 
    '15/01/24': 1,
    '16/01/24': 1,
    '19/01/24': 0,
    '22/01/24': 1,
    '23/01/24': -1,
    '24/01/24': -1,
    '25/01/24': -1,
    '26/01/24': -1,
    '29/01/24': 1,
    '30/01/24': 1,
    '31/01/24': 1
}

# Add the labels to df_articles; dates missing from `labels` map to NaN.
df_articles['label'] = df_articles['date'].map(labels)

In [59]:
# Save df_articles to CSV.
df_articles.to_csv('article_vectors_with_labels.csv', index=False) 

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# Assumes df_articles already has the 'article_vector' and 'label' columns.

# Fixed class order so the confusion-matrix rows/columns always line up with
# their names, even when a class is missing from the test split.
class_labels = [-1, 0, 1]
class_names = ["Down", "Neutral", "Up"]

# Step 1: Prepare the data (X = article vectors, y = labels)
X = np.vstack(df_articles['article_vector'].values)  # Stack article vectors into a 2D array
y = df_articles['label'].values  # Labels corresponding to the articles

# Step 2: Train-test split (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train a multinomial Logistic Regression classifier
# class_weight='balanced' reweights classes inversely to their frequency.
clf = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced', C=1.0)
clf.fit(X_train, y_train)

# Step 4: Make predictions on the held-out split
y_pred = clf.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Classification report (precision, recall, f1-score) as a dictionary
classification_rep = classification_report(y_test, y_pred, output_dict=True)

# Confusion matrix (rows = actual class, columns = predicted class).
# Passing labels= pins the matrix to 3x3 in the order of class_labels.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Convert the evaluation results into dataframes for display
classification_rep_df = pd.DataFrame(classification_rep).transpose()
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)

# Step 6: Display the results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep_df)
print("\nConfusion Matrix:\n", conf_matrix_df)

# Step 7: Predict labels for all articles.
# NOTE: X contains the training rows too, so these predictions are not an
# out-of-sample estimate.
y_pred_all = clf.predict(X)

# Assign predictions to the DataFrame
df_articles['predicted_label'] = y_pred_all

# Save the articles with their predicted labels; this overwrites the earlier
# export that uses the same file name.
df_articles.to_csv('article_vectors_with_labels.csv', index=False)



Accuracy: 0.3393665158371041

Classification Report:
               precision    recall  f1-score     support
-1             0.666667  0.043011  0.080808   93.000000
0              0.142857  0.464286  0.218487   28.000000
1              0.467742  0.580000  0.517857  100.000000
accuracy       0.339367  0.339367  0.339367    0.339367
macro avg      0.425755  0.362432  0.272384  221.000000
weighted avg   0.510290  0.339367  0.296011  221.000000

Confusion Matrix:
          Down  Neutral  Up
Down        4       38  51
Neutral     0       13  15
Up          2       40  58
