In [1]:
from googletrans import Translator
# Shared googletrans client; the translation calls further down reuse this instance.
translator = Translator()


In [2]:
# Quick check of Portuguese (src='pt') to English (dest='en') translation.
translator.translate(text = "olá. como vai vc?", dest = 'en', src = 'pt').text

'Hi.how are you?'

In [3]:
from nltk.sentiment import SentimentAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [4]:
# VADER analyzer instance used by every polarity_scores call in this section.
analyzer = SentimentIntensityAnalyzer()

In [7]:
# Sanity check on a plain English sentence that contains a negation ("not happy").
analyzer.polarity_scores("hi, i am not happy right now")

{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'compound': -0.4585}

In [10]:
# Finance-domain sentence: the recorded result is fully neutral (compound 0.0).
analyzer.polarity_scores("GME prices are going very high")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [11]:
# English finance sentence scored directly, with no translation step.
analyzer.polarity_scores("We maybe have found some good opportunities to invest ina FII")

{'neg': 0.0, 'neu': 0.621, 'pos': 0.379, 'compound': 0.6705}

In [13]:
# Full pipeline: translate the Portuguese sentence to English, then score the English text with VADER.
analyzer.polarity_scores(translator.translate(text = "Temos visto uma crescente oportunidade para investir em FIIs", dest = 'en', src = 'pt').text)

{'neg': 0.0, 'neu': 0.64, 'pos': 0.36, 'compound': 0.5423}

In [14]:
# Negative-sounding sentence (a drop in valuation leading to giving up the idea);
# the recorded score is fully neutral (compound 0.0), so the negativity is not picked up.
analyzer.polarity_scores(translator.translate(text = "Queda recente no valuation de FIIs nos faz desistir da ideia", dest = 'en', src = 'pt').text)


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [18]:
# Reworded variant with an explicit negation ("não investir"); the recorded score is still fully neutral.
analyzer.polarity_scores(translator.translate(text = "Queda recente nos FIIs nos faz não investir nela", dest = 'en', src = 'pt').text)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [26]:
# Translate-then-score for a sentence about poor fund performance ("péssimo desempenho").
analyzer.polarity_scores(translator.translate(text = "Esses fundos de investimento possuem péssimo desempenho", dest = 'en', src = 'pt').text)

{'neg': 0.383, 'neu': 0.617, 'pos': 0.0, 'compound': -0.4767}

In [25]:
# Show the intermediate English text produced for the same Portuguese sentence that was scored with VADER.
translator.translate(text = "Esses fundos de investimento possuem péssimo desempenho", dest = 'en', src = 'pt').text

'These investment funds have poor performance'

In [27]:
# Translate-then-score for a sentence about falling PETR4 share prices.
analyzer.polarity_scores(translator.translate(text = "Os preços das ações de PETR4 tem caído significativamente", dest = 'en', src = 'pt').text)

{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'compound': -0.3612}

### Other translation tools
* https://towardsdatascience.com/translate-any-two-languages-in-60-lines-of-python-b54dc4a9e739
* https://fasttext.cc/


## Translation / Sentiment with Hugging Face pretrained models

In [1]:
from transformers import MarianTokenizer, MarianMTModel

# Checkpoint id for the Marian translation model (plain literal; nothing to interpolate).
translation_model_name = 'Helsinki-NLP/opus-mt-roa-en'
# Tokenizer and model are both loaded from the same checkpoint id.
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
model = MarianMTModel.from_pretrained(translation_model_name)

Downloading: 100%|██████████| 800k/800k [00:00<00:00, 1.95MB/s]
Downloading: 100%|██████████| 779k/779k [00:00<00:00, 1.70MB/s]
Downloading: 100%|██████████| 1.46M/1.46M [00:00<00:00, 2.38MB/s]
Downloading: 100%|██████████| 265/265 [00:00<00:00, 132kB/s]


In [3]:
# Translate the text
# The tokenizer is given a list of sentences; here the list holds one Portuguese sentence.
# Note: return_tensors="pt" selects PyTorch tensors in the transformers API; it is not
# the Portuguese language code used with googletrans earlier in this notebook.
inputs = tokenizer(["Olá, tentando traduzir essa frase"], return_tensors="pt", padding=True)
gen = model.generate(**inputs)
# Decode the generated ids back to text, dropping special tokens, and show the first result.
tokenizer.batch_decode(gen, skip_special_tokens=True)[0]

'Hello, trying to translate that phrase.'

In [4]:
from transformers import pipeline

In [5]:
# No model argument is passed, so the library chooses its default checkpoint for this task.
# NOTE(review): pinning an explicit model name would make reruns reproducible.
senti_pipeline = pipeline("sentiment-analysis")

Downloading: 100%|██████████| 629/629 [00:00<00:00, 634kB/s]
Downloading: 100%|██████████| 268M/268M [00:24<00:00, 10.8MB/s]
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 48.1kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 995kB/s]


In [7]:
# Quick check of the default pipeline on a short negated phrase.
senti_pipeline("not that good")

[{'label': 'NEGATIVE', 'score': 0.99976646900177}]

In [8]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Hub id of the checkpoint used in this section (plain literal; no interpolation is needed).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE: this rebinds `tokenizer`, which held the Marian translation tokenizer up to this point.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. Every other token is kept unchanged, and the
    tokens are joined back together with single spaces.
    """
    def _mask(token):
        # A lone "@" is left as-is; only real mentions are replaced.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))



Downloading: 100%|██████████| 747/747 [00:00<00:00, 747kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.72MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.27MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 151kB/s]


In [10]:
# Download the label mapping for the chosen TweetEval task so that the model's
# output scores can be paired with label names.
task = 'sentiment'
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
# A timeout keeps the kernel from hanging indefinitely if the host is unreachable.
with urllib.request.urlopen(mapping_link, timeout=30) as f:
    html = f.read().decode('utf-8').split("\n")
# Rows are tab-separated; the label name is read from the second column.
# Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
# Fail here rather than let a later zip(labels, scores) silently produce an empty dict.
assert labels, f"no labels parsed from {mapping_link}"

In [12]:
# Load the sequence-classification model for MODEL.
# NOTE: this rebinds `model`, which held the Marian translation model up to this point.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes to a local directory whose relative path equals the hub id.
# On the next run, from_pretrained(MODEL) resolves to that directory instead of the hub,
# so the directory must also contain the tokenizer files; otherwise
# AutoTokenizer.from_pretrained(MODEL) fails after a kernel restart.
# `tokenizer` here is the one loaded from MODEL in the cell that defines MODEL.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

Downloading: 100%|██████████| 499M/499M [00:49<00:00, 10.0MB/s]


In [17]:
# Classify one English sentence and pair each probability with its label name.
text = preprocess("These investment funds have poor performance")
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First element of the model output, first row (presumably the logits for the single
# input sentence), converted to NumPy and normalised with softmax.
scores = softmax(output[0][0].detach().numpy())
dict(zip(labels, scores))

{'negative': 0.9457971, 'neutral': 0.051542837, 'positive': 0.002660079}

In [18]:
# Same task as the default pipeline, but with an explicitly chosen checkpoint.
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")


Downloading: 100%|██████████| 687/687 [00:00<00:00, 229kB/s]
Downloading: 100%|██████████| 1.42G/1.42G [02:28<00:00, 9.57MB/s]
Downloading: 100%|██████████| 256/256 [00:00<00:00, 85.4kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 1.70MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.49MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 156kB/s]


In [21]:
# Same phrase as the default-pipeline check, for a side-by-side comparison of the two models.
print(sentiment_analysis("not that good"))


[{'label': 'NEGATIVE', 'score': 0.9994285702705383}]


### Some Portuguese sentiment models to test:
* https://github.com/rdenadai/sentiment-analysis-2018-president-election
* https://github.com/acardosoj/LeIA