# PT-BR Financial News Sentiment

## Model for Clean Articles

In [None]:
#Environment configuration

!pip install transformers torch numpy pandas

from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd

In [8]:
from transformers import AutoTokenizer, BertForSequenceClassification
import os
import pandas as pd

In [9]:
from transformers import (
    AutoTokenizer, 
    BertForSequenceClassification,
    pipeline,
)

finbert_pt_br_tokenizer = AutoTokenizer.from_pretrained("lucas-leme/FinBERT-PT-BR")
finbert_pt_br_model = BertForSequenceClassification.from_pretrained("lucas-leme/FinBERT-PT-BR")

finbert_pt_br_pipeline = pipeline(task='text-classification', model=finbert_pt_br_model, tokenizer=finbert_pt_br_tokenizer)



In [11]:
def article_classification(directory, max_length=512):
    results = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)

        with open(path, 'r', encoding='utf-8') as fhand:
            article = fhand.read()
            tokens = finbert_pt_br_pipeline.tokenizer.encode(article, truncation=True, max_length=max_length, return_tensors='pt')
            if tokens.shape[1] > max_length:
                tokens = tokens[:, :max_length]

            truncated_text = finbert_pt_br_pipeline.tokenizer.decode(tokens[0], skip_special_tokens=True)

            sentiment = finbert_pt_br_pipeline(truncated_text)

            classification = {
                'file': os.path.basename(filename),
                'sentiment': sentiment[0]['label'],
                'score': sentiment[0]['score']
            }
            results.append(classification)

    results = pd.DataFrame(results)

    return results

print(article_classification('News_Sample/andre'))

          file sentiment     score
0    File1.xml  NEGATIVE  0.791573
1   File10.xml  NEGATIVE  0.823842
2   File11.xml  POSITIVE  0.827733
3   File12.xml  NEGATIVE  0.780306
4   File13.xml   NEUTRAL  0.671244
5   File14.xml  POSITIVE  0.555569
6   File15.xml   NEUTRAL  0.551850
7   File16.xml  NEGATIVE  0.685905
8   File17.xml  POSITIVE  0.439669
9   File18.xml  NEGATIVE  0.794118
10  File19.xml  NEGATIVE  0.528298
11   File2.xml   NEUTRAL  0.536406
12   File3.xml  POSITIVE  0.371683
13   File4.xml  NEGATIVE  0.750617
14   File5.xml  POSITIVE  0.593563
15   File6.xml  POSITIVE  0.451566
16   File7.xml  NEGATIVE  0.720258
17   File8.xml  NEGATIVE  0.831076
18   File9.xml  NEGATIVE  0.578087


## Pipeline

1. Gather textual data
    - 1 - Test Valor Economico texts 
    - 2 - Test OCR Bloomberg texts 
2. Define Keywords/Phrases
    - Automation: How can I automate the process of selecting what is considered relevant?
3. Text preprocessing (cleaning and preparing articles)
    - Normalize textual data
4. Filter articles
    - Perform on each article: evaluate for RELEVANT SENTENCES ONLY
    - Provide "irrelevant" output for futile articles if no sentences hold relevant information
5. Sentiment analysis
6. Trade signals

### Keyword/Phrase Definitions

- Filter by KEEPING relevant articles
- Select articles with RELEVANT keywords (macroeconomic conditions, interest rates, good/bad for USD or BRL)

Economic Indicators:

Keywords:

In [21]:
#Filter Function

def filter_by_macro(directory): 
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)

        with open(path, 'r', encoding='utf-8') as fhand:
            #convert all text to lowercase for normalization
            article = fhand.readlines()

            keywords = [
                'economia', 'mercado', 'investimento', 'ação', 'renda',
                'taxa de juros', 'inflação', 'recessão', 'câmbio', 'moeda',
                'bolsas', 'dividendos', 'cenário econômico', 'política monetária',
                'ativos', 'passivos', 'lucro', 'perda', 'especulação', 'financiamento'
            ]

print(filter_by_macro(r"News_Sample\andre"))


<
?
x
m
l
 
v
e
r
s
i
o
n
=
"
1
.
0
"
 
e
n
c
o
d
i
n
g
=
"
u
t
f
-
8
"
?
>


<
N
o
t
i
c
i
a
 
c
o
d
i
g
o
=
"
1
a
5
b
c
a
4
2
-
e
3
1
a
-
4
3
3
6
-
8
1
b
d
-
9
1
6
b
7
a
3
c
5
0
3
c
"
>


 
 
<
C
a
t
e
g
o
r
i
a
>
<
!
[
C
D
A
T
A
[
F
i
n
a
n
ç
a
s
]
]
>
<
/
C
a
t
e
g
o
r
i
a
>


 
 
<
O
r
i
g
e
m
>
2
8
<
/
O
r
i
g
e
m
>


 
 
<
D
a
t
a
>
0
6
/
0
8
/
2
0
2
4
<
/
D
a
t
a
>


 
 
<
L
i
n
k
>
h
t
t
p
s
:
/
/
v
a
l
o
r
.
g
l
o
b
o
.
c
o
m
/
f
i
n
a
n
c
a
s
/
n
o
t
i
c
i
a
/
2
0
2
4
/
0
8
/
0
6
/
i
b
o
v
e
s
p
a
-
i
n
i
c
i
a
-
s
e
s
s
a
o
-
e
m
-
l
e
v
e
-
a
l
t
a
-
u
m
-
d
i
a
-
a
p
o
s
-
p
a
n
i
c
o
-
n
a
s
-
b
o
l
s
a
s
-
g
l
o
b
a
i
s
.
g
h
t
m
l
<
/
L
i
n
k
>


 
 
<
P
r
o
c
e
d
e
n
c
i
a
>
M
a
r
i
a
 
F
e
r
n
a
n
d
a
 
S
a
l
i
n
e
t
<
/
P
r
o
c
e
d
e
n
c
i
a
>


 
 
<
M
a
n
c
h
e
t
e
>
<
!
[
C
D
A
T
A
[
I
b
o
v
e
s
p
a
 
i
n
i
c
i
a
 
s
e
s
s
ã
o
 
e
m
 
l
e
v
e
 
a
l
t
a
 
u
m
 
d
i
a
 
a
p
ó
s
 
p
â
n
i
c
o
 
n
a
s
 
b
o
l
s
a
s
 
g
l
o
b
a
i
s
]
]
>
<
/
M
a
n
c
h
e
t
e
>


 
 
<


# Trading Signal Generation