In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
import numpy as np
import re
import unicodedata

In [2]:
# Carregando o dataset
data = pd.read_csv('financial_phrase_bank_pt_br.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'financial_phrase_bank_pt_br.csv'

In [None]:
data.head()

Unnamed: 0,y,text,text_pt
0,neutral,Technopolis plans to develop in stages an area...,A Technopolis planeja desenvolver em etapas um...
1,negative,The international electronic industry company ...,"A Elcoteq, empresa internacional da indústria ..."
2,positive,With the new production plant the company woul...,Com a nova planta de produção a empresa aument...
3,positive,According to the company 's updated strategy f...,De acordo com a estratégia atualizada da empre...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,FINANCIAMENTO DO CRESCIMENTO DA ASPOCOMP A Asp...


In [None]:
data.shape

(4845, 3)

In [None]:
def preprocess_text(text):
    text = text.lower()  # Convertendo para minúsculas
    text = re.sub(r'[^\w\s]', '', text)  # Removendo pontuações
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')  # Removendo acentos
    return text

In [None]:
# Aplicando o pré-processamento nas colunas 'text' e 'text_pt'
data['text'] = data['text'].apply(preprocess_text)
data['text_pt'] = data['text_pt'].apply(preprocess_text)

In [None]:
data.head()

Unnamed: 0,y,text,text_pt
0,neutral,technopolis plans to develop in stages an area...,a technopolis planeja desenvolver em etapas um...
1,negative,the international electronic industry company ...,a elcoteq empresa internacional da industria e...
2,positive,with the new production plant the company woul...,com a nova planta de producao a empresa aument...
3,positive,according to the company s updated strategy fo...,de acordo com a estrategia atualizada da empre...
4,positive,financing of aspocomp s growth aspocomp is agg...,financiamento do crescimento da aspocomp a asp...


In [None]:
# Contagem das classes
class_counts = data['y'].value_counts()

In [None]:
# Identificando a classe majoritária
major_class = class_counts.idxmax()

In [None]:
# Definindo o tamanho da amostra para ser igual à classe majoritária
sample_size = class_counts[major_class]

In [None]:
# Separando os dados por classe
data_major = data[data['y'] == major_class]
data_minor = data[data['y'] != major_class]

In [None]:
# Fazendo o resample da classe minoritária
data_minor_resampled = resample(data_minor, replace=True, n_samples=sample_size, random_state=42)

In [None]:
# Combinando as classes novamente em um novo dataframe
data_balanced = pd.concat([data_major, data_minor_resampled])

In [None]:
# Vetorização do texto em TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(data_balanced['text_pt']).toarray()

In [None]:
# Convertendo a coluna de classes em números usando LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data_balanced['y'])

In [None]:
# Dividindo os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Modelo de classificação Random Forest
model = RandomForestClassifier()

In [None]:
param_grid = {
    'n_estimators': [50, 150, 200, 250],
    'max_depth': [None, 300, 500],
}

In [None]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Melhores parâmetros encontrados pelo RandomizedSearchCV
best_params = grid_search.best_params_

In [None]:
print(best_params)

{'max_depth': None, 'n_estimators': 250}


In [None]:
# Treinando o modelo com os melhores parâmetros encontrados
model = RandomForestClassifier(**best_params)
model.fit(X_train, y_train)

In [None]:
# Fazendo as previsões no conjunto de teste
y_pred = model.predict(X_test)

In [None]:
# Computando a taxa de acerto (accuracy) e o kappa
accuracy = accuracy_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

In [None]:
# Exibindo os resultados
print("Taxa de acerto (accuracy): {:.2f}%".format(accuracy * 100))
print("Kappa: {:.2f}".format(kappa))

Taxa de acerto (accuracy): 90.28%
Kappa: 0.84
