In [1]:
from googletrans import Translator
# Shared googletrans client; the translation cells below all call translator.translate on it.
translator = Translator()


In [2]:
# Smoke-test the translator on a short Portuguese greeting; the cell displays the English text.
greeting = translator.translate(text="olá. como vai vc?", src='pt', dest='en')
greeting.text

'Hi.how are you?'

In [3]:
from nltk.sentiment import SentimentAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [4]:
# VADER sentiment scorer; the polarity_scores cells below all reuse this instance.
analyzer = SentimentIntensityAnalyzer()

In [7]:
# Baseline check on a plain English sentence containing a negation.
sentence = "hi, i am not happy right now"
analyzer.polarity_scores(sentence)

{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'compound': -0.4585}

In [10]:
# Finance-flavoured sentence; the recorded result is fully neutral (compound 0.0).
sentence = "GME prices are going very high"
analyzer.polarity_scores(sentence)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [11]:
# English investment sentence scored directly, without going through the translator.
sentence = "We maybe have found some good opportunities to invest ina FII"
analyzer.polarity_scores(sentence)

{'neg': 0.0, 'neu': 0.621, 'pos': 0.379, 'compound': 0.6705}

In [13]:
# Portuguese sentence -> English via googletrans, then VADER scores on the translated text.
translation = translator.translate(
    text="Temos visto uma crescente oportunidade para investir em FIIs",
    src='pt',
    dest='en',
)
analyzer.polarity_scores(translation.text)

{'neg': 0.0, 'neu': 0.64, 'pos': 0.36, 'compound': 0.5423}

In [14]:
# Translate a sentence about a recent drop in FII valuation, then score it.
# The recorded result is fully neutral (compound 0.0).
translation = translator.translate(
    text="Queda recente no valuation de FIIs nos faz desistir da ideia",
    src='pt',
    dest='en',
)
analyzer.polarity_scores(translation.text)


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [18]:
# Variant of the "recent drop" sentence with an explicit negation; the recorded result is again neutral.
translation = translator.translate(
    text="Queda recente nos FIIs nos faz não investir nela",
    src='pt',
    dest='en',
)
analyzer.polarity_scores(translation.text)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [26]:
# Translate a sentence about poorly performing investment funds, then score the English text.
translation = translator.translate(
    text="Esses fundos de investimento possuem péssimo desempenho",
    src='pt',
    dest='en',
)
analyzer.polarity_scores(translation.text)

{'neg': 0.383, 'neu': 0.617, 'pos': 0.0, 'compound': -0.4767}

In [25]:
# Show the English text the translator produces for the "poor performance" sentence.
translation = translator.translate(
    text="Esses fundos de investimento possuem péssimo desempenho",
    src='pt',
    dest='en',
)
translation.text

'These investment funds have poor performance'

In [27]:
# Translate a sentence about falling PETR4 share prices, then score the English text.
translation = translator.translate(
    text="Os preços das ações de PETR4 tem caído significativamente",
    src='pt',
    dest='en',
)
analyzer.polarity_scores(translation.text)

{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'compound': -0.3612}

### Other translation tools
* https://towardsdatascience.com/translate-any-two-languages-in-60-lines-of-python-b54dc4a9e739
* https://fasttext.cc/


## Translation / Sentiment with Hugging Face pretrained models

In [33]:
from transformers import MarianTokenizer, MarianMTModel

# Checkpoint id of the Marian translation model used to translate the Portuguese tweet below.
translation_model_name = 'Helsinki-NLP/opus-mt-roa-en'

# NOTE: `tokenizer` and `model` are rebound to the sentiment model further down,
# so the translation cells must run before those cells.
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
model = MarianMTModel.from_pretrained(translation_model_name)

In [15]:
# Write the model and tokenizer files to a local directory named after the checkpoint id.
# The cell output is the tuple of file paths returned by the tokenizer's save_pretrained.
save_dir = translation_model_name
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('Helsinki-NLP/opus-mt-roa-en\\tokenizer_config.json',
 'Helsinki-NLP/opus-mt-roa-en\\special_tokens_map.json',
 WindowsPath('Helsinki-NLP/opus-mt-roa-en/source_spm'),
 WindowsPath('Helsinki-NLP/opus-mt-roa-en/target_spm'),
 WindowsPath('Helsinki-NLP/opus-mt-roa-en/vocab'),
 WindowsPath('Helsinki-NLP/opus-mt-roa-en/tokenizer_config_file'),
 'Helsinki-NLP/opus-mt-roa-en\\added_tokens.json')

In [34]:
# Translate a Portuguese tweet with the Marian model and decode the generated ids back to text.
# `tokenizer` and `model` must still be the Marian objects here (both names are rebound later).
tweet_pt = "🛢️ Petrobras (#PETR4) levanta R$ 11,3 bilhões com privatização da BR Distribuidora (#BRDT3).\n\n#economia #mercado #Brasil #notícias"
inputs = tokenizer([tweet_pt], return_tensors="pt", padding=True)
gen = model.generate(**inputs)
tokenizer.batch_decode(gen, skip_special_tokens=True)

['Petrobras (#PETR4) raises R$ 11.3 billion with privatization of the BR Distribuidora (#BRDT3). #economy #market #Brazil #news']

In [2]:
from transformers import pipeline

In [35]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
# The task name selects both the checkpoint id and the label-mapping URL fetched later.
task = 'sentiment'
MODEL = "cardiffnlp/twitter-roberta-base-" + task

# Rebinds `tokenizer` (previously the Marian tokenizer) to the sentiment checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess(text):
    """Mask user mentions and links in ``text`` before it is tokenized.

    The text is split on single spaces only, so tokens separated by other
    whitespace (e.g. a newline) are treated as one token. Each token is then:

    * replaced by ``'@user'`` if it starts with ``'@'`` and is longer than one
      character,
    * replaced by ``'http'`` if it starts with ``'http'``,
    * left unchanged otherwise.

    Returns the tokens re-joined with single spaces, so the original spacing
    between tokens is preserved.
    """
    def _mask(token):
        # A bare "@" is not a mention and is kept as-is.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))



In [5]:
# Download the label mapping file for `task` and keep the label name from each row.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    # splitlines() yields one entry per line, without a trailing empty element.
    mapping_lines = response.read().decode('utf-8').splitlines()

# Rows are tab-separated and the second column is taken as the label;
# rows with fewer than two columns (e.g. blank lines) are skipped.
labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]

In [36]:
# Rebinds `model` (previously the Marian translation model) to the sentiment classifier for MODEL.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# To keep a local copy of the checkpoint, uncomment:
# model.save_pretrained(MODEL)
# tokenizer.save_pretrained(MODEL)

In [38]:
# Score the translated tweet with the sentiment classifier.
# `tokenizer` and `model` must be the objects loaded from MODEL here, not the
# Marian translation objects that were bound to the same names earlier.
text = ['Petrobras (#PETR4) raises R$ 11.3 billion with privatization of the BR Distribuidora (#BRDT3). #economy #market #Brazil #news']
text = [preprocess(x) for x in text]

# padding=True lets several texts with different token counts be stacked into
# one tensor; with a single text it changes nothing.
encoded_input = tokenizer(text, return_tensors='pt', padding=True)
output = model(**encoded_input)

# output[0] is the model's first output (the per-class scores before softmax);
# softmax along axis 1 turns each row into probabilities that sum to 1.
scores = output[0].detach().numpy()
scores = softmax(scores, axis = 1)
scores

array([[0.01297673, 0.83843815, 0.14858511]], dtype=float32)

In [30]:
# Row-wise softmax over the whole batch of scores in `output`.
# NOTE(review): the recorded output has two rows, so this cell was last run against a
# two-text batch rather than the single tweet scored in the cell above; re-run to refresh it.
softmax(output[0].detach().numpy(), axis = 1)

array([[0.96737456, 0.02879267, 0.00383274],
       [0.00228572, 0.01300936, 0.9847049 ]], dtype=float32)

In [27]:
# Probabilities for the first text in the batch only (row 0 of the scores).
softmax(output[0][0].detach().numpy())

array([0.96737456, 0.02879267, 0.00383274], dtype=float32)

In [28]:
# Probabilities for the second text in the batch (row 1 of the scores).
# NOTE(review): requires `output` to come from a batch of at least two texts; with the
# single-tweet batch built in the scoring cell, output[0][1] is out of range.
softmax(output[0][1].detach().numpy())

array([0.00228572, 0.01300936, 0.9847049 ], dtype=float32)

In [18]:
# Build a ready-made sentiment-analysis pipeline from the checkpoint named below.
# The recorded output shows the checkpoint files being downloaded on first use.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)


Downloading: 100%|██████████| 687/687 [00:00<00:00, 229kB/s]
Downloading: 100%|██████████| 1.42G/1.42G [02:28<00:00, 9.57MB/s]
Downloading: 100%|██████████| 256/256 [00:00<00:00, 85.4kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 1.70MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.49MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 156kB/s]


In [21]:
# Quick check on a short negated phrase; the pipeline returns a list with one label/score dict.
negated_result = sentiment_analysis("not that good")
print(negated_result)


[{'label': 'NEGATIVE', 'score': 0.9994285702705383}]


### Some Portuguese sentiment models to test:
* https://github.com/rdenadai/sentiment-analysis-2018-president-election
* https://github.com/acardosoj/LeIA