# PT-BR Financial News Sentiment

## Model for Clean Articles

In [4]:
from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd
from bs4 import BeautifulSoup
import re

In [5]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification,
    pipeline,
)

# The tokenizer and the classification weights are loaded from the same checkpoint id.
_FINBERT_PT_BR_CHECKPOINT = "lucas-leme/FinBERT-PT-BR"

finbert_pt_br_tokenizer = AutoTokenizer.from_pretrained(_FINBERT_PT_BR_CHECKPOINT)
finbert_pt_br_model = BertForSequenceClassification.from_pretrained(_FINBERT_PT_BR_CHECKPOINT)

# Text-classification pipeline built from the model and tokenizer loaded above.
finbert_pt_br_pipeline = pipeline(
    task='text-classification',
    model=finbert_pt_br_model,
    tokenizer=finbert_pt_br_tokenizer,
)

In [6]:
def article_classification(directory, max_length=512, classifier=None):
    """Classify the sentiment of every file in ``directory``.

    Each regular file is read as UTF-8 text and passed to ``classifier``
    with tokenizer-level truncation, so inputs longer than ``max_length``
    tokens are cut once by the tokenizer itself. (Truncating by
    encode -> decode -> re-tokenize is lossy and can push the text back
    over the limit on the second tokenization.)

    Args:
        directory: Path of the directory holding the article files.
        max_length: Maximum number of tokens handed to the model.
        classifier: Callable invoked as
            ``classifier(text, truncation=True, max_length=max_length)``
            and returning a sequence whose first item is a mapping with
            ``'label'`` and ``'score'`` keys. Defaults to the module-level
            ``finbert_pt_br_pipeline``.

    Returns:
        A ``pandas.DataFrame`` with columns ``file``, ``sentiment`` and
        ``score``: one row per regular file, ordered by file name. The
        columns are present even when the directory has no files.
    """
    if classifier is None:
        classifier = finbert_pt_br_pipeline

    rows = []
    # os.listdir returns entries in arbitrary order; sort for a stable table.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Sub-directories (and other non-files) cannot be opened as text.
        if not os.path.isfile(path):
            continue

        # Close the file before running inference.
        with open(path, 'r', encoding='utf-8') as fhand:
            article = fhand.read()

        top_prediction = classifier(article, truncation=True, max_length=max_length)[0]
        rows.append({
            'file': filename,
            'sentiment': top_prediction['label'],
            'score': top_prediction['score'],
        })

    return pd.DataFrame(rows, columns=['file', 'sentiment', 'score'])

# Classify every article in the sample directory and print the resulting table.
print(article_classification('News_Sample/andre'))

          file sentiment     score
0    File1.xml  NEGATIVE  0.791573
1   File10.xml  NEGATIVE  0.823842
2   File11.xml  POSITIVE  0.827733
3   File12.xml  NEGATIVE  0.780306
4   File13.xml   NEUTRAL  0.671244
5   File14.xml  POSITIVE  0.555569
6   File15.xml   NEUTRAL  0.551850
7   File16.xml  NEGATIVE  0.685905
8   File17.xml  POSITIVE  0.439669
9   File18.xml  NEGATIVE  0.794118
10  File19.xml  NEGATIVE  0.528298
11   File2.xml   NEUTRAL  0.536406
12   File3.xml  POSITIVE  0.371683
13   File4.xml  NEGATIVE  0.750617
14   File5.xml  POSITIVE  0.593563
15   File6.xml  POSITIVE  0.451566
16   File7.xml  NEGATIVE  0.720258
17   File8.xml  NEGATIVE  0.831076
18   File9.xml  NEGATIVE  0.578087


## Pipeline

1. Gather textual data
    - 1 - Test Valor Econômico texts
    - 2 - Test BDM texts 
2. Define Keywords/Phrases
    - Automation: How can I automate the process of selecting what is considered relevant?
3. Text preprocessing (cleaning and preparing articles)
    - Normalize textual data
4. Filter articles
    - For each article, evaluate only the RELEVANT SENTENCES
    - Label an article "irrelevant" when none of its sentences contain relevant information
5. Sentiment analysis
6. Trade signals

### Step 3 - Cleaning BDM News

In [10]:
# The source .docx file was manually converted to .txt before this step.

def clean_file(inputFile, outputFile):
    """Strip bracket/ellipsis noise from a text file and separate its entries.

    For every line of ``inputFile``, all ``[``, ``]`` and ``…`` characters
    are removed and surrounding whitespace is stripped. Lines that end up
    empty are dropped. A line whose third and sixth characters are both
    ``/`` (presumably a date prefix such as ``DD/MM/...``; only the two
    slashes are checked) is treated as the start of a new entry and is
    preceded by one empty line, unless it is the first line kept.

    Args:
        inputFile: Path of the UTF-8 text file to read.
        outputFile: Path of the UTF-8 text file to write (overwritten).
    """
    # One pass that deletes the three unwanted characters.
    noise_table = str.maketrans('', '', '[]…')

    cleaned_lines = []
    with open(inputFile, 'r', encoding='utf-8') as file:
        for line in file:
            cleaned_line = line.translate(noise_table).strip()
            if not cleaned_line:
                continue

            # Slices rather than indexes: a line shorter than six characters
            # yields '' here instead of raising IndexError.
            starts_entry = cleaned_line[2:3] == '/' and cleaned_line[5:6] == '/'
            if starts_entry and cleaned_lines:
                cleaned_lines.append('')

            cleaned_lines.append(cleaned_line)

    with open(outputFile, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in cleaned_lines)

# Clean the BDM news text file and write the result to cleaned_output.txt.
clean_file('BDM_News_Text.txt', 'cleaned_output.txt')

### Keyword/Phrase Definitions

- Filter by KEEPING relevant articles
- Select articles with RELEVANT keywords (macroeconomic conditions, interest rates, good/bad for USD or BRL)

Economic Indicators:

Keywords:

# Trading Signal Generation