# Instalando Dependências

In [None]:
# Install the third-party packages needed by the cells below (Colab shell escapes).
# NOTE(review): no versions are pinned, so a re-run may pull newer releases than the
# ones this notebook was written against — consider pinning.
!pip install transformers
!pip install sentencepiece
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Carregando dados

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
tqdm.pandas()

# Load the raw texts and keep only the rows that carry a theme label.
df = pd.read_excel("/content/drive/MyDrive/Iniciação Científica/IC 2022/Data/just_brute_financial_texts.xlsx")
df = (
    df.loc[df["theme_brute"].notna()]
      # Presumably a leftover index column from an earlier export; it is not used anywhere.
      .drop(columns="Unnamed: 0")
      # Later cells pair rows of `df` with results that are indexed 0..n-1, so the
      # gaps left by the null filter must not survive in the index.
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,url,texts,is_title,theme_brute,old_index
0,2011-12-06 00:47:32+00:00,https://valor.globo.com/noticia/2011/12/06/rai...,Raia Drogasil substitui conselho fiscal por co...,True,Saúde,0
1,2012-03-06 16:07:43+00:00,https://valor.globo.com/noticia/2012/03/06/rio...,"NAIRÓBI - A Rio+20, a conferência das Nações U...",False,Financeiro,9
2,2018-01-03 17:00:00+00:00,https://exame.com/negocios/6-empresas-de-fast-...,"O foco da holding Infinity Services, dona da H...",False,Financeiro,11
3,2012-09-25 03:00:08+00:00,https://valor.globo.com/empresas/noticia/2012/...,"Em 1982, o Brasil enfrentava uma das crises fi...",False,Financeiro,12
4,2011-08-23 05:32:42+00:00,https://valor.globo.com/noticia/2011/08/23/imp...,Com um aumento de 39% nas compras feitas no ex...,False,Bens Industriais,13


# BERTopic

In [None]:
from bertopic import BERTopic

# Topic model used for the whole corpus.
# `calculate_probabilities=True` is required because the fitting helper below turns the
# probabilities returned by `fit_transform` into a documents x topics table.
topic_model = BERTopic(
    language="portuguese",
    top_n_words=10,
    n_gram_range=(2, 3),
    min_topic_size=20,
    nr_topics="auto",
    calculate_probabilities=True,
    low_memory=True,
    verbose=True,
)

In [None]:
def topic_probs_to_df(model,probabilities):
  """Wrap a document-topic probability matrix in a DataFrame labelled by topic id.

  Args:
    model: fitted topic model exposing ``get_topic_info()``, which must return a
      DataFrame with a ``Topic`` column.
    probabilities: 2-D array-like with one row per document and one column per
      non-outlier topic (assumed to be ordered by ascending topic id — confirm
      against the topic-model version in use).

  Returns:
    DataFrame holding ``probabilities`` whose columns are the non-outlier topic ids.
  """
  topic_ids = model.get_topic_info()["Topic"]
  # Drop the outlier topic (-1) by value instead of assuming it is the first row:
  # when there are no outliers, or the info table is ordered differently, slicing
  # off row 0 would discard a real topic and mislabel the columns.
  topic_ids = topic_ids[topic_ids != -1].sort_values()
  return pd.DataFrame(probabilities, columns=pd.Index(topic_ids, name="Topic"))

def get_all_topic_words(model, nth_topic):
  """Return the terms of topic ``nth_topic`` as one comma-separated string.

  ``model.get_topic(nth_topic)`` must yield items whose first element is the term;
  any further elements of each item are ignored.
  """
  terms = []
  for entry in model.get_topic(nth_topic):
    terms.append(entry[0])
  return ", ".join(terms)

def fit_docs_to_model(model, docs):
  """Fit ``model`` on ``docs`` and return the resulting topic probabilities as a DataFrame.

  ``model.fit_transform(docs)`` must return a ``(topics, probabilities)`` pair; only
  the probabilities are kept and handed to ``topic_probs_to_df``.
  """
  _assigned_topics, doc_topic_probs = model.fit_transform(docs)
  return topic_probs_to_df(model, doc_topic_probs)

def topic_info_to_df(model):
  """Return the model's topic-info table for real topics, plus a ``words`` column.

  Args:
    model: fitted topic model exposing ``get_topic_info()`` (DataFrame with a
      ``Topic`` column) and ``get_topic(topic_id)``.

  Returns:
    A copy of the topic-info rows whose ``Topic`` is not -1, sorted by topic id,
    with ``words`` holding each topic's terms joined by ", ".
  """
  info = model.get_topic_info()
  # Select real topics by id rather than by slicing off row 0, and sort by id so
  # that row k always describes topic k; .copy() avoids assigning into a slice.
  df_topics = info.loc[info["Topic"] != -1].sort_values("Topic").copy()
  # Look the words up by the row's own topic id, not by its position in the table.
  df_topics["words"] = [get_all_topic_words(model, topic_id)
                        for topic_id in df_topics["Topic"].tolist()]
  return df_topics

In [None]:
# Raw texts in the row order of `df`.
documents = df["texts"].tolist()

# Fit the topic model, then collect the documents x topics probabilities and the
# per-topic word strings.
result_df = fit_docs_to_model(topic_model, documents)
topic_info_df = topic_info_to_df(topic_model)

# Rescale every document's topic probabilities so that each row sums to 1.
row_totals = result_df.sum(axis=1)
normalized_results = result_df.div(row_totals, axis="index")

# If a column is labelled -1 it is relabelled 0; otherwise this leaves the frame unchanged.
reindexed_bert_results = normalized_results.rename(columns={-1: 0})

Batches:   0%|          | 0/9227 [00:00<?, ?it/s]

2022-06-19 19:35:31,178 - BERTopic - Transformed documents to Embeddings
2022-06-19 19:53:18,084 - BERTopic - Reduced dimensionality


# Zero-Shot

In [None]:
# Every (positive, neutral, negative) wording combination: 2 neutral words x
# 3 positive words x 3 negative words = 18 candidate-label triples.
labels = [
    [positive, neutral, negative]
    for neutral in ("Indiferente", "Neutro")
    for positive in ("Positivo", "Otimista", "Ótimo")
    for negative in ("Negativo", "Pessimista", "Ruim")
]

# Hypothesis sentence passed to the zero-shot classifier as `hypothesis_template`.
template = "O sentimento desse conjunto de palavras é {}."

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline; the model weights are fetched on first use.
classifier = pipeline(
    task="zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
)

In [None]:
def zero_shot_classify(classifier, topics, labels, template):
    """Run the zero-shot classifier over every entry of ``topics``.

    Each entry is classified on its own against ``labels`` using ``template`` as the
    hypothesis template, in single-label mode. ``topics`` must provide
    ``progress_apply`` (registered on pandas objects by ``tqdm.pandas()``).

    Returns whatever ``topics.progress_apply`` returns: one classifier result per entry.
    """
    def classify_one(word_sequence):
        return classifier(word_sequence,
                          candidate_labels=labels,
                          hypothesis_template=template,
                          multi_label=False)

    return topics.progress_apply(classify_one)

# One set of classifier outputs per label triple, in the same order as `labels`.
zeroshot_results = [
    zero_shot_classify(classifier, topic_info_df["words"], label, template)
    for label in labels
]

In [None]:
def mapper(item_tuple):
  """Convert one enumerated classifier result into a ``{label: score}`` dict.

  ``item_tuple`` is an ``(index, result)`` pair; the index is ignored and ``result``
  must provide parallel ``"labels"`` and ``"scores"`` sequences.
  """
  _position, result = item_tuple
  label_scores = {}
  for label, score in zip(result["labels"], result["scores"]):
    label_scores[label] = score
  return label_scores

topic_to_label_df = []
results_df = []
speech2theme_df = []

for i in tqdm(range(len(labels))):
    # topics x labels: the classifier score of every label for every topic.
    topic_to_label_df.append(pd.DataFrame(list(map(mapper, enumerate(zeroshot_results[i])))))
    # (documents x topics) @ (topics x labels) -> documents x labels.
    speech2theme_df.append(reindexed_bert_results @ topic_to_label_df[i])
    # Highest-scoring label per document; vectorised equivalent of a row-wise idxmax apply.
    results_df.append(pd.DataFrame(speech2theme_df[i].idxmax(axis=1)))
    # Assign by position: the scores are indexed 0..n-1 in document order, which need
    # not match the index labels of `df`, and label alignment would silently
    # misplace values or insert NaN.
    df["sentiment_{}".format(str(i))] = results_df[i][0].to_numpy()

In [None]:
from collections import Counter

def most_frequent(List):
    """Return the value that occurs most often in ``List``.

    Counting is done with ``collections.Counter``; an empty input raises ``IndexError``.
    """
    ranked = Counter(List).most_common(1)
    top_value, _count = ranked[0]
    return top_value

# Collapse every wording variant onto the three canonical classes.
synonym_groups = (
    (["Ótimo", "Otimista"], "Positivo"),
    (["Ruim", "Pessimista"], "Negativo"),
    ("Indiferente", "Neutro"),
)

for i in tqdm(range(len(labels))):
    column = 'sentiment_{}'.format(str(i))
    normalized = df[column]
    for variants, canonical in synonym_groups:
        normalized = normalized.replace(variants, canonical)
    df[column] = normalized

true_sentiment = []

# Derive the vote columns from `labels` so the list cannot drift out of sync with
# the number of label triples (it was previously spelled out by hand, 0..17).
sentiment_columns = ["sentiment_{}".format(str(i)) for i in range(len(labels))]

# Walk the rows by position: looking values up with `df[col][i]` is label-based and
# fails (or picks the wrong row) whenever the index of `df` is not exactly 0..n-1.
# Each row tuple keeps the column order, so ties resolve to the earliest vote column.
for votes in tqdm(df[sentiment_columns].itertuples(index=False, name=None), total=len(df)):
    true_sentiment.append(most_frequent(votes))

df["true_sentiment"] = true_sentiment

df.to_excel("/content/with_sentiments_financial_texts.xlsx", index=False)

In [None]:
# How many texts ended up in each final (most frequent vote) sentiment class.
df["true_sentiment"].value_counts()

In [None]:
import seaborn as sns
sns.set(rc={'figure.figsize':(16, 10)})
# Pass the column explicitly as `x=`: newer seaborn releases no longer accept the data
# vector as the first positional argument of `countplot` (it is read as `data`).
sns.countplot(x=df["true_sentiment"])