In [11]:
import os
# Cap the block size PyTorch's CUDA caching allocator is allowed to split at 512 MB.
# NOTE(review): presumably this has to run before torch first touches CUDA for the
# setting to be picked up — confirm against the PyTorch memory-management notes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [12]:
!nvidia-smi

Wed Nov 22 18:19:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 517.20       Driver Version: 517.20       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   50C    P8     7W /  N/A |     30MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
import torch
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
sns.set(rc={'figure.figsize':(10,6)})
sns.set(font_scale=1.3)
plt.style.use('fivethirtyeight')

import re
import string
import swifter
import spacy
import pickle

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import AutoModelForSequenceClassification,Trainer, TrainingArguments, pipeline, AutoTokenizer
from huggingface_hub import notebook_login

"GPU Available" if torch.cuda.is_available() else "--Not available--"

'GPU Available'

# Loading training datasets

In [4]:
# B2W-Reviews01: an open corpus of more than 130,000 product reviews released by
# B2W Digital, one of the most prominent Latin American e-commerce companies. It has two
# target features: the binary label "recommend to a friend" and a 1-5 star user rating;
# only the user rating is used here.
#
# Corpus Buscapé: a large corpus of Portuguese product reviews (more than 80,000
# samples) crawled in 2013 from Buscapé, a product and price search website.
# Source: https://www.kaggle.com/datasets/fredericods/ptbr-sentiment-analysis-datasets
#
# Both files are reduced to the shared (review_text, rating) columns and stacked.
df_file = pd.concat(
    [
        pd.read_csv(csv_path, sep=',')[['review_text', 'rating']]
        for csv_path in ("data2/archive/b2w.csv", "data2/archive/buscape.csv")
    ]
)
df_file.shape

(217364, 2)

In [5]:
# Share of reviews per star rating (relative to all rows of the combined corpus).
df_file['rating'].value_counts() / len(df_file)

5    0.373539
4    0.303284
1    0.140364
3    0.127339
2    0.055474
Name: rating, dtype: float64

In [9]:
def smart_truncate(content, length=280, suffix='...'):
    """Shorten ``content`` to at most ``length`` characters, cutting on a space.

    Args:
        content: The string to shorten.
        length: Maximum number of characters to keep (default 280).
        suffix: Accepted for backward compatibility only; it is NOT appended to the
            result, so truncated text carries no ellipsis marker.

    Returns:
        ``content`` unchanged when it already fits; otherwise the longest prefix that
        ends before the last space found in the first ``length + 1`` characters. If
        that window holds no usable space (one very long token), the text is cut hard
        at ``length`` characters instead of being reduced to an empty string.
    """
    if len(content) <= length:
        return content

    # Look one character past the limit so a word ending exactly at `length` is kept.
    window = content[:length + 1]
    last_space = window.rfind(' ')
    if last_space <= 0:
        # No word boundary to cut on: fall back to a plain character cut rather than
        # silently discarding the whole text.
        return content[:length]
    return window[:last_space]


def limpa_texto(data):
    """Normalise a pandas Series of raw texts before tokenisation.

    Every element is coerced with ``str()`` (so a missing value becomes the literal
    text ``'nan'``) and then, in this order:

    1. URLs (``www.…`` / ``http(s)://…``) are removed;
    2. ``@user`` mentions are replaced by a space;
    3. hashtags are removed;
    4. every character that is not an ASCII letter/digit, a space or one of the
       whitelisted Portuguese accented letters is dropped (this already removes all
       ASCII punctuation; accented letters outside the whitelist, e.g. ``à``, are
       dropped too);
    5. runs of spaces are collapsed, the text is stripped and lower-cased;
    6. the text is shortened with ``smart_truncate`` (280 characters, tweet length).

    Args:
        data: pandas Series of texts. The ``swifter`` accessor must be available.

    Returns:
        A Series of cleaned strings aligned with ``data``.
    """
    def _limpa_um(texto):
        # Single-element pipeline; raw strings avoid invalid-escape warnings.
        tx = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', str(texto))
        tx = re.sub(r'@[^\s]+', ' ', tx)
        tx = re.sub(r'(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tx)
        tx = re.sub(r'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ ]', '', tx)
        tx = re.sub(r' +', ' ', tx).strip().lower()
        return smart_truncate(tx)

    # One pass over the Series instead of one pass per cleaning step.
    return data.swifter.apply(_limpa_um)

In [7]:
# Clean the review text (the column keeps the "tweet" name used for the tweets scored later).
df_file = df_file.reset_index(drop=True)
df_file['tweet_text_limpo'] = limpa_texto(df_file['review_text'])

# Star rating -> sentiment id: 1-2 stars negative (0), 3 stars neutral (2), 4-5 stars positive (1).
sentiment = {1: 0, 2: 0, 3: 2, 4: 1, 5: 1}
df_file['sentiment'] = df_file['rating'].map(sentiment).astype(np.int64)

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/217364 [00:00<?, ?it/s]

In [9]:
# Stratified 80/20 split on the sentiment label. The fixed random_state makes the split
# (and therefore the reported validation metrics) reproducible across kernel restarts.
train_ds, validation_ds = train_test_split(
    df_file[['tweet_text_limpo', 'sentiment']],
    test_size=0.2,
    stratify=df_file['sentiment'],
    random_state=42,
)

In [10]:
# Convert the pandas splits to Hugging Face datasets. preserve_index=False stops the
# shuffled pandas index left by train_test_split from being stored as an extra column.
train_ds = Dataset.from_pandas(train_ds, preserve_index=False)
validation_ds = Dataset.from_pandas(validation_ds, preserve_index=False)

## Tokenization

In [11]:
# Hub identifier of the pretrained checkpoint; the same identifier is used for the
# classification model loaded in the Training section.
model_ckpt = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [12]:
def tokenize(batch):
    """Encode the ``tweet_text_limpo`` field of ``batch`` with the module-level tokenizer.

    Texts are truncated to at most 128 tokens and padded (``padding=True``) so all
    sequences in the batch share one length.
    """
    texts = batch["tweet_text_limpo"]
    encoded = tokenizer(texts, max_length=128, truncation=True, padding=True)
    return encoded

In [13]:
# batch_size=None passes the whole split to `tokenize` as one batch (hence the 0/1
# progress bar), so its padding=True pads every example to a common length.
train_ds_encoded = train_ds.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
# batch_size=None passes the whole split to `tokenize` as one batch (hence the 0/1
# progress bar), so its padding=True pads every example to a common length.
validation_ds_encoded = validation_ds.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

## Training

In [15]:
# Three target classes (the sentiment ids 0, 1 and 2 built from the star ratings).
num_labels = 3
model_ckpt = "neuralmind/bert-base-portuguese-cased"

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained encoder with a freshly initialised classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [16]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [25]:
batch_size = 32

# Number of optimisation steps in one pass over the training set, used as the logging interval.
logging_steps = len(train_ds_encoded) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation / logging
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    # keep the run local
    push_to_hub=False,
)

In [26]:
# Release cached, currently unused GPU memory before training. empty_cache() does not
# involve autograd, so no torch.no_grad() context is needed; skip it on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Rename the text and target columns to 'text' and 'label'.
train_ds_encoded = (
    train_ds_encoded
    .rename_column('tweet_text_limpo', 'text')
    .rename_column('sentiment', 'label')
)

In [22]:
# Rename the text and target columns to 'text' and 'label'.
validation_ds_encoded = (
    validation_ds_encoded
    .rename_column('tweet_text_limpo', 'text')
    .rename_column('sentiment', 'label')
)

In [23]:
# Class imbalance: 'balanced' weights, one per distinct training label
# (ordered like np.unique, i.e. by ascending label id).
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_ds_encoded['label']),
    y=train_ds_encoded['label'],
)

In [28]:
from torch import nn

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; must contain ``"labels"`` next to the model inputs.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            **kwargs: Ignored. Absorbs extra keyword arguments (such as
                ``num_items_in_batch``) that newer ``Trainer`` versions pass.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' device instead of hard-coding 'cuda', so the
        # CPU fallback chosen when the model was loaded keeps working.
        weight = torch.tensor(class_weights, dtype=torch.float32, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Fine-tune with the class-weighted loss; compute_metrics reports accuracy and F1 at each evaluation.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_encoded,
    eval_dataset=validation_ds_encoded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Trailing semicolon hides the returned training summary in the cell output.
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5937,0.560088,0.813746,0.828213
2,0.5025,0.560897,0.804729,0.822059


# Save the model

In [29]:
# Save the tokenizer next to the model weights: the loading cell reads BOTH from
# './sentiment_transfer_learning_transformer_buscape/', so writing the tokenizer to a
# different ('..._union_buscape') directory left it in a place nothing loads from.
tokenizer.save_pretrained('./sentiment_transfer_learning_transformer_buscape/')

# Save the model
trainer.save_model('./sentiment_transfer_learning_transformer_buscape/')

# Load the model

In [5]:
# Load tokenizer (from the directory the trained model was saved to)
tokenizer = AutoTokenizer.from_pretrained("./sentiment_transfer_learning_transformer_buscape/")

# Load model (fine-tuned classification weights from that same directory)
loaded_model = AutoModelForSequenceClassification.from_pretrained('./sentiment_transfer_learning_transformer_buscape/')

# Predicting over the test dataset

In [6]:
# Labelled tweets used to evaluate the model (columns `text` and `sentiment` are used below).
df_test = pd.read_excel('data2/Test.xlsx')

In [32]:
df_test.head()

Unnamed: 0,IDs aleatorios,text,sentiment
0,51954,Just posted a photo @ Parque Municipal das Ara...,neutro
1,4190,ncêndio em unidade de conservação na Amazônia ...,negativo
2,65306,Lixeiras antifauna são testadas no Parque Naci...,positivo
3,105536,Parque Nacional da Tijuca abriga maior preguiç...,positivo
4,57593,#betacaralhudosan Vídeo mostra incêndio na par...,negativo


In [7]:
# Run inference on the CPU; `model` now refers to the loaded, fine-tuned model.
model = loaded_model.to('cpu')
classifier = pipeline(task="text-classification", model=loaded_model, tokenizer=tokenizer)

In [8]:
def make_predictions(text):
    """Score ``text`` with the module-level ``classifier`` and return the scores of every label."""
    all_scores = classifier(text, return_all_scores=True)
    return all_scores

In [10]:
# Apply the same cleaning function that was used on the training reviews.
df_test['tweet_text_limpo'] = limpa_texto(df_test.text)

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

In [36]:
# Score every cleaned test tweet, one call per row.
df_test["preds"] = df_test["tweet_text_limpo"].apply(make_predictions)



In [37]:
def probas(preds, tipo):
    """Pick one class score out of a prediction.

    Args:
        preds: Nested sequence whose first element lists one ``{'score': ...}``
            mapping per class, in label-id order (0 negative, 1 positive, 2 neutral).
        tipo: ``'positive'`` or ``'negative'``; any other value selects the third
            (neutral) entry.

    Returns:
        The ``'score'`` value of the selected class.
    """
    class_scores = preds[0]
    if tipo == 'positive':
        return class_scores[1]['score']
    if tipo == 'negative':
        return class_scores[0]['score']
    return class_scores[2]['score']

# One score column per class; `tipo` is forwarded to probas by Series.apply.
df_test["positivo"] = df_test["preds"].apply(probas, tipo='positive')
df_test["negativo"] = df_test["preds"].apply(probas, tipo='negative')
df_test["neutro"] = df_test["preds"].apply(probas, tipo='neutral')

In [38]:
# Predicted label = name of the score column with the highest value in each row.
df_test['sentiment_pred'] = df_test.loc[:, ['positivo', 'negativo', 'neutro']].idxmax(axis='columns')

# Label id -> Portuguese label name. This rebinds the name `sentiment`, which earlier
# held the rating -> label-id map; it is not used by the cells visible after this one.
sentiment = {0: "negativo", 1: "positivo", 2: "neutro"}

In [39]:
# Per-class precision/recall/F1 of the predicted labels against the annotated ones.
print(classification_report(df_test['sentiment'], df_test['sentiment_pred']))

              precision    recall  f1-score   support

    negativo       0.83      0.53      0.65       436
      neutro       0.23      0.01      0.03       795
    positivo       0.44      0.95      0.60       769

    accuracy                           0.49      2000
   macro avg       0.50      0.50      0.43      2000
weighted avg       0.44      0.49      0.38      2000



# Predicting final dataset

In [None]:
# Tweets to be scored; the columns `text_original` and `UC_text` are used by later cells.
df = pd.read_excel('data2/sentiment_analysis_PN.xlsx')

In [None]:
# Preprocess the tweet text with the same cleaning function used for the training data.
df['tweet_text_limpo'] = limpa_texto(df.text_original)

In [None]:
# Score every cleaned tweet, one call per row.
df["preds"] = df["tweet_text_limpo"].apply(make_predictions)

In [None]:
# One score column per class; `tipo` is forwarded to probas by Series.apply.
df["positive"] = df["preds"].apply(probas, tipo='positive')
df["negative"] = df["preds"].apply(probas, tipo='negative')
df["neutral"] = df["preds"].apply(probas, tipo='neutral')

In [None]:
# Final label = name of the score column with the highest value in each row.
df['sentiment'] = df.loc[:, ['positive', 'negative', 'neutral']].idxmax(axis='columns')

In [15]:
# Predictions already made -- delete.
# NOTE(review): this replaces `df` with a spreadsheet that presumably holds the saved
# output of the prediction cells above (they show no execution count) — confirm.
df = pd.read_excel('sentiment_analysis_predicted_modelo_buscape.xlsx')

# Negative topic analysis in the six principal national parks in Brazil


## Topic model common parameters

In [6]:
# Topic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords

# Portuguese stop words, excluded from the topic vocabulary below.
stop_words = set(stopwords.words('portuguese'))

# Dimensionality reduction shared by every topic model (seeded).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,
)

# Topic representation: unigrams and bigrams, class-based TF-IDF, then MMR re-ranking.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=list(stop_words))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.2)

# Sentence encoder used to embed the tweets before clustering.
sentence_model = SentenceTransformer("neuralmind/bert-base-portuguese-cased")

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
No sentence-transformers model found with name C:\Users\TROPIBIO/.cache\torch\sentence_transformers\neuralmind_bert-base-portuguese-cased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\TROPIBIO/.cache\torch\sentence_transformers\neuralmind_bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the chec

#### PARQUE NACIONAL DO IGUAÇU - Negative

In [16]:
# Negative tweets about Parque Nacional do Iguaçu, as a one-column frame of pandas strings.
df_nega_igua = (
    df.loc[
        (df['sentiment'] == 'negative') & (df['UC_text'] == 'PARQUE NACIONAL DO IGUAÇU'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [10]:
# Clustering step: min_cluster_size=10 was chosen because this park has more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nega_igua['tweet_text_limpo'], show_progress_bar=False)

topic_model1n = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_negative_igua, probabilities_negative_igua = topic_model1n.fit_transform(
    df_nega_igua['tweet_text_limpo'], embeddings=embeddings
)

In [11]:
# First five rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_igua = (
    topic_model1n.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_igua

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,457,-1_chacina parque_chacina_ruralistas_compartil...,0.146993
1,0,273,0_morto dentro_veado encontrado_encontrado mor...,0.08781
2,1,261,1_meio parque_biodiversidade_rasgar meio_reman...,0.08395
3,2,227,2_táxis_iguaçu decisão_proíbe_taxis,0.073014
4,3,148,3_atropelada_jaguatirica_próximo parque_atrope...,0.047604


#### PARQUE NACIONAL DO IGUAÇU - No Negative

In [12]:
# Positive and neutral tweets about Parque Nacional do Iguaçu, as a one-column frame of pandas strings.
df_nonega_igua = (
    df.loc[
        df['sentiment'].isin(['positive', 'neutral']) & (df['UC_text'] == 'PARQUE NACIONAL DO IGUAÇU'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [13]:
# Clustering step: min_cluster_size=30 because this subset has far more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nonega_igua['tweet_text_limpo'], show_progress_bar=False)

topic_model1nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_nonegative_igua, probabilities_nonegative_igua = topic_model1nn.fit_transform(
    df_nonega_igua['tweet_text_limpo'], embeddings=embeddings
)

In [14]:
# First five rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_igua_nn = (
    topic_model1nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_igua_nn

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,5548,-1_vista_iguaçú_itaipu_çu parque,0.261562
1,0,1448,0_sobe domingo_iguaçu sobe_sobe_visitantes páscoa,0.068266
2,1,761,1_milhão visitantes_iguaçu recebeu_recebeu_pes...,0.035878
3,2,732,2_pr im_pr others_at parque_others im,0.03451
4,3,651,3_iguaçu reajusta_reajusta_reajusta valor_ingr...,0.030692


#### PARQUE NACIONAL DA TIJUCA - Negative

In [15]:
# Negative tweets about Parque Nacional da Tijuca, as a one-column frame of pandas strings.
df_nega_tiju = (
    df.loc[
        (df['sentiment'] == 'negative') & (df['UC_text'] == 'PARQUE NACIONAL DA TIJUCA'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [18]:
# Clustering step for this subset (min_cluster_size=6).
hdbscan_model = HDBSCAN(
    min_cluster_size=6,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nega_tiju['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2n is reassigned by other parks' cells further down, so it
# always refers to the most recently fitted model.
topic_model2n = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_negative_tiju, probabilities_negative_tiju = topic_model2n.fit_transform(
    df_nega_tiju['tweet_text_limpo'], embeddings=embeddings
)

In [19]:
# Rows 1-4 of the topic table (the first row is skipped) plus each topic's share of all
# documents. Percentage is computed on the full table before slicing; assign() avoids
# writing into an iloc slice (SettingWithCopyWarning) and get_topic_info() runs once.
df_topic_tiju = (
    topic_model2n.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[1:5]
)

df_topic_tiju

Unnamed: 0,Topic,Count,Name,Percentage
1,0,186,0_ambiental_proteção ambiental_proteção_área,0.194561
2,1,99,1_tijuca icmbio_tijuca floresta_icmbio_floresta,0.103556
3,2,88,2_46_46 anos_mora 46_idoso mora,0.09205
4,3,87,3_assaltados parque_assaltados_visitantes assa...,0.091004


#### PARQUE NACIONAL DA TIJUCA - No Negative

In [20]:
# Positive and neutral tweets about Parque Nacional da Tijuca, as a one-column frame of pandas strings.
df_nonega_tiju = (
    df.loc[
        df['sentiment'].isin(['positive', 'neutral']) & (df['UC_text'] == 'PARQUE NACIONAL DA TIJUCA'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [22]:
# Clustering step: min_cluster_size=12 because this subset has more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=12,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nonega_tiju['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2nn is reassigned by other parks' cells further down, so it
# always refers to the most recently fitted model.
topic_model2nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_nonegative_tiju, probabilities_nonegative_tiju = topic_model2nn.fit_transform(
    df_nonega_tiju['tweet_text_limpo'], embeddings=embeddings
)

In [23]:
# First five rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_tiju_nn = (
    topic_model2nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_tiju_nn

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,2565,-1_fica_aqui_lindo_natureza,0.265116
1,0,440,0_ir_ir parque_ponto alto_trilha parque,0.045478
2,1,418,1_publicar foto_acabou publicar_acabou_foto pa...,0.043204
3,2,316,2_rj im_tijuca in_in rio_at parque,0.032661
4,3,312,3_tijuca parque_viii_tijuca ix_tijuca viii,0.032248


#### PARQUE NACIONAL DO ITATIAIA - Negative

In [24]:
# Negative tweets about Parque Nacional do Itatiaia, as a one-column frame of pandas strings.
df_nega_ita = (
    df.loc[
        (df['sentiment'] == 'negative') & (df['UC_text'] == 'PARQUE NACIONAL DO ITATIAIA'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [25]:
# Clustering step for this subset (min_cluster_size=6).
hdbscan_model = HDBSCAN(
    min_cluster_size=6,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nega_ita['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2n is also assigned by other parks' cells, so it always
# refers to the most recently fitted model.
topic_model2n = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_negative_ita, probabilities_negative_ita = topic_model2n.fit_transform(
    df_nega_ita['tweet_text_limpo'], embeddings=embeddings
)

In [26]:
# Rows 1-4 of the topic table (the first row is skipped) plus each topic's share of all
# documents. Percentage is computed on the full table before slicing; assign() avoids
# writing into an iloc slice (SettingWithCopyWarning) and get_topic_info() runs once.
df_topic_ita = (
    topic_model2n.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[1:5]
)

df_topic_ita

Unnamed: 0,Topic,Count,Name,Percentage
1,0,208,0_assista_itatiaia assista_vídeo mostra_mostra...,0.207378
2,1,82,1_queda árvore_árvore_turistas parque_saída tu...,0.081755
3,2,73,2_graus_temperatura_zero_abaixo zero,0.072782
4,3,66,3_sul rio_itatiaia sul_sul_atinge,0.065803


#### PARQUE NACIONAL DO ITATIAIA - No Negative

In [27]:
# Positive and neutral tweets about Parque Nacional do Itatiaia, as a one-column frame of pandas strings.
df_nonega_ita = (
    df.loc[
        df['sentiment'].isin(['positive', 'neutral']) & (df['UC_text'] == 'PARQUE NACIONAL DO ITATIAIA'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [28]:
# Clustering step: min_cluster_size=12 because this subset has more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=12,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nonega_ita['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2nn is also assigned by other parks' cells, so it always
# refers to the most recently fitted model.
topic_model2nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_nonegative_ita, probabilities_nonegative_ita = topic_model2nn.fit_transform(
    df_nonega_ita['tweet_text_limpo'], embeddings=embeddings
)

In [29]:
# First five rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_ita_nn = (
    topic_model2nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_ita_nn

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,1667,-1_travessia_dias_sul rio_costa verde,0.211871
1,0,763,0_tv rio_rio sul_morre voluntários_antigos,0.096975
2,1,602,1_lá_vou_ir_pro,0.076512
3,2,251,2_santuário ecológico_itatiaia verdadeiro_verd...,0.031901
4,3,240,3_primeiro parque_anos parque_1937_criado,0.030503


#### PARQUE NACIONAL DA CHAPADA DOS VEADEIROS - Negative

In [30]:
# Negative tweets about Parque Nacional da Chapada dos Veadeiros, as a one-column frame of pandas strings.
df_nega_chapa = (
    df.loc[
        (df['sentiment'] == 'negative') & (df['UC_text'] == 'PARQUE NACIONAL DA CHAPADA DOS VEADEIROS'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [31]:
# Clustering step for this subset (min_cluster_size=4).
hdbscan_model = HDBSCAN(
    min_cluster_size=4,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nega_chapa['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2n is also assigned by other parks' cells, so it always
# refers to the most recently fitted model.
topic_model2n = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_negative_chapa, probabilities_negative_chapa = topic_model2n.fit_transform(
    df_nega_chapa['tweet_text_limpo'], embeddings=embeddings
)

In [32]:
# Rows 1-4 of the topic table (the first row is skipped) plus each topic's share of all
# documents. Percentage is computed on the full table before slicing; assign() avoids
# writing into an iloc slice (SettingWithCopyWarning) and get_topic_info() runs once.
df_topic_chapa = (
    topic_model2n.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[1:5]
)

df_topic_chapa

Unnamed: 0,Topic,Count,Name,Percentage
1,0,174,0_publicar_acabou publicar_publicar foto_veade...,0.146096
2,1,149,1_quase 15_dosveadeiros incêndio_dosveadeiros_...,0.125105
3,2,50,2_casa perigo_carinhas habitam_carinhas_todos ...,0.041982
4,3,39,3_saudades_filtro_nada_veadeiros saudades,0.032746


#### PARQUE NACIONAL DA CHAPADA DOS VEADEIROS - No Negative

In [34]:
# Positive and neutral tweets about Parque Nacional da Chapada dos Veadeiros, as a one-column frame of pandas strings.
df_nonega_chapa = (
    df.loc[
        df['sentiment'].isin(['positive', 'neutral']) & (df['UC_text'] == 'PARQUE NACIONAL DA CHAPADA DOS VEADEIROS'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [36]:
# Clustering step: min_cluster_size=10 because this subset has more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nonega_chapa['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2nn is also assigned by other parks' cells, so it always
# refers to the most recently fitted model.
topic_model2nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_nonegative_chapa, probabilities_nonegative_chapa = topic_model2nn.fit_transform(
    df_nonega_chapa['tweet_text_limpo'], embeddings=embeddings
)

In [37]:
# First five rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_chapa_nn = (
    topic_model2nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_chapa_nn

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,1174,-1_anos_sobre_fica_trilha,0.29797
1,0,246,0_mil hectares_hectares_mil_decreto,0.062437
2,1,179,1_incêndio parque_combate_combate incêndio_inc...,0.045431
3,2,177,2_veadeiros concedido_concedido_concedido inic...,0.044924
4,3,170,3_in alto_veadeiros in_im at_im,0.043147


#### PARQUE NACIONAL DOS LENÇÓIS MARANHENSES - Negative

In [39]:
# Negative tweets about Parque Nacional dos Lençóis Maranhenses, as a one-column frame of pandas strings.
df_nega_len = (
    df.loc[
        (df['sentiment'] == 'negative') & (df['UC_text'] == 'PARQUE NACIONAL DOS LENÇÓIS MARANHENSES'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [40]:
# Clustering step for this subset (min_cluster_size=3).
hdbscan_model = HDBSCAN(
    min_cluster_size=3,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nega_len['tweet_text_limpo'], show_progress_bar=False)

topic_model3n = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_negative_len, probabilities_negative_len = topic_model3n.fit_transform(
    df_nega_len['tweet_text_limpo'], embeddings=embeddings
)

In [41]:
# First six rows of the topic table plus each topic's share of all documents.
# Percentage is computed on the full table before slicing; assign() avoids writing into
# an iloc slice (SettingWithCopyWarning) and get_topic_info() is called only once.
df_topic_len = (
    topic_model3n.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:6]
)

df_topic_len

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,46,-1_maranhenses_filme_maranhenses alguém_requer...,0.141104
1,0,120,0_quadriciclos parque_quadriciclos_trânsito qu...,0.368098
2,1,83,1_projeto_limites_limites parque_altera,0.254601
3,2,23,2_estradas_risco parque_põem risco_põem,0.070552
4,3,14,3_nacional lençois_lençois_lençois maranhenses...,0.042945
5,4,12,4_acidente_mortos oito_mortos_quatro mortos,0.03681


#### PARQUE NACIONAL DOS LENÇÓIS MARANHENSES - No Negative

In [42]:
# Positive and neutral tweets about Parque Nacional dos Lençóis Maranhenses, as a one-column frame of pandas strings.
df_nonega_len = (
    df.loc[
        df['sentiment'].isin(['positive', 'neutral']) & (df['UC_text'] == 'PARQUE NACIONAL DOS LENÇÓIS MARANHENSES'),
        ['tweet_text_limpo'],
    ]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [44]:
# Clustering step: min_cluster_size=10 because this subset has more observations.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric='euclidean',
    prediction_data=True,
)

# Sentence embeddings are computed once and handed to BERTopic.
embeddings = sentence_model.encode(df_nonega_len['tweet_text_limpo'], show_progress_bar=False)

# NOTE: the name topic_model2nn is also assigned by other parks' cells above, so it
# always refers to the most recently fitted model.
topic_model2nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

topics_nonegative_len, probabilities_nonegative_len = topic_model2nn.fit_transform(
    df_nonega_len['tweet_text_limpo'], embeddings=embeddings
)

In [45]:
# First 5 rows of the topic summary for the positive/neutral model (row 0 is the
# topic with id -1), plus each topic's share of the total document count.
#
# Percentage is added to the full summary *before* slicing. Assigning a column
# onto the result of `.iloc[...]` writes into a slice of another DataFrame and
# raises pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
# The denominator is still the Count total over *all* topics, and
# get_topic_info() is now called once instead of three times.
df_topic_len_nn = (
    topic_model2nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:5]
)

df_topic_len_nn

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,1160,-1_deserto_dunas_lagoas_cidade,0.299664
1,0,315,0_nacional los_los lençóis_los_en,0.081374
2,1,213,1_publicar_publicar foto_foto parque_amaro mar...,0.055025
3,2,175,2_ma im_barreirinhas ma_im at_im,0.045208
4,3,118,3_surreal landscape_landscape of_landscape_sur...,0.030483


## Principal topics in negative tweets for all national parks in Brazil

In [122]:
# One-column DataFrame with the cleaned text of every row whose sentiment is
# 'negative' (no filter on UC_text). The index is reset to 0..n-1 and the
# column is cast to pandas' 'string' dtype.
df_nega_all = (
    df.loc[df.sentiment == 'negative', ['tweet_text_limpo']]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [10]:
# Fit a BERTopic model on all negative tweets stored in df_nega_all.
# The sentence embeddings are computed once here and passed to fit_transform below.
embeddings = sentence_model.encode(
    df_nega_all['tweet_text_limpo'],
    show_progress_bar=False,
)

# Clustering step: HDBSCAN with a minimum cluster size of 20.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    prediction_data=True,
)

# umap_model, vectorizer_model, ctfidf_model and representation_model are
# defined outside this cell.
topic_model_all = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the cleaned tweet text, reusing the embeddings computed above.
topics_negative, probabilities_negative = topic_model_all.fit_transform(
    df_nega_all['tweet_text_limpo'],
    embeddings,
)

In [12]:
# First 10 rows of the topic summary for the all-parks negative model (row 0 is
# the topic with id -1), plus each topic's share of the total document count.
#
# Percentage is added to the full summary *before* slicing. Assigning a column
# onto the result of `.iloc[...]` writes into a slice of another DataFrame and
# raises pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
# The denominator is still the Count total over *all* topics, and
# get_topic_info() is now called once instead of three times.
df_topic_all = (
    topic_model_all.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:10]
)

df_topic_all

Unnamed: 0,Topic,Count,Name,Percentage
0,-1,5060,-1_pode_ão_meio_hoje,0.275179
1,0,1038,0_ba meses_pascoal ba_meses icmbio_icmbio incê...,0.05645
2,1,586,1_diamantina incêndio_gigantesco consumindo_in...,0.031869
3,2,381,2_colono_estrada colono_aceitamos_atlântica bi...,0.02072
4,3,377,3_incêndio fecha_cipó minas_gerais incêndio_fe...,0.020503
5,4,375,4_pra_noronha_cara_ministro,0.020394
6,5,369,5_tava_ir_pro_pro parque,0.020067
7,6,355,6_óleo_manchas_manchas óleo_óleo chegam,0.019306
8,7,332,7_proporções atinge_brasília incêndio_incêndio...,0.018055
9,8,324,8_acabou publicar_publicar foto_publicar_foto ...,0.01762


## Principal topics in non-negative tweets for all national parks in Brazil

In [12]:
# One-column DataFrame with the cleaned text of every row whose sentiment is
# 'positive' or 'neutral' (no filter on UC_text). The index is reset to 0..n-1
# and the column is cast to pandas' 'string' dtype.
df_nonega_all = (
    df.loc[df.sentiment.isin(['positive', 'neutral']), ['tweet_text_limpo']]
    .reset_index(drop=True)
    .astype({'tweet_text_limpo': 'string'})
)

In [13]:
# Size check: (rows, columns) of the positive/neutral tweet frame before topic modelling.
df_nonega_all.shape

(87852, 1)

In [None]:
# Fit a BERTopic model on all positive/neutral tweets stored in df_nonega_all.
# NOTE(review): this cell has no execution count in the saved notebook, so it was
# apparently never run to completion; with the 87,852 rows reported for
# df_nonega_all, calculate_probabilities=True may be expensive — confirm.
embeddings = sentence_model.encode(
    df_nonega_all['tweet_text_limpo'],
    show_progress_bar=False,
)

# Clustering step: HDBSCAN with a minimum cluster size of 30.
hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric='euclidean',
    prediction_data=True,
)

# umap_model, vectorizer_model, ctfidf_model and representation_model are
# defined outside this cell.
topic_model_all_nn = BERTopic(
    language="multilingual",
    nr_topics="auto",
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the cleaned tweet text, reusing the embeddings computed above.
topics_nonegative, probabilities_nonegative = topic_model_all_nn.fit_transform(
    df_nonega_all['tweet_text_limpo'],
    embeddings,
)

In [None]:
# First 10 rows of the topic summary for the all-parks positive/neutral model,
# plus each topic's share of the total document count.
#
# Percentage is added to the full summary *before* slicing. Assigning a column
# onto the result of `.iloc[...]` writes into a slice of another DataFrame and
# raises pandas' SettingWithCopyWarning; `assign` returns a new frame instead.
# The denominator is still the Count total over *all* topics, and
# get_topic_info() is now called once instead of three times.
df_topic_nn_all = (
    topic_model_all_nn.get_topic_info()
    .assign(Percentage=lambda info: info['Count'] / info['Count'].sum())
    .iloc[0:10]
)

df_topic_nn_all