# Classificador de Intenções
Pré processamento de textos - comparação entre Stemming X Lemmatization


In [1]:

import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

In [4]:
# ===== Dataset sintético =====

df = pd.read_csv('dataset_bancario.csv')
df.rename(columns={"frase": "texto", "label": "classe"}, inplace=True)
df.head()


Unnamed: 0,texto,classe
0,Cobrar o valor total de R$ 2338,Cobrança
1,Cobrar a quantia de R$ 4137 referente ao serviço,Cobrança
2,Gerar cobrança automática de R$ 4088,Cobrança
3,Passe a fatura de R$ 1724 no cartão,Cobrança
4,"Cobrar taxa de R$ 2370,77 pela entrega",Cobrança


In [5]:
# Codificação das labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["classe"])
df.head()

Unnamed: 0,texto,classe,label
0,Cobrar o valor total de R$ 2338,Cobrança,0
1,Cobrar a quantia de R$ 4137 referente ao serviço,Cobrança,0
2,Gerar cobrança automática de R$ 4088,Cobrança,0
3,Passe a fatura de R$ 1724 no cartão,Cobrança,0
4,"Cobrar taxa de R$ 2370,77 pela entrega",Cobrança,0


## Stemming
 - Mac Morpho
 - Stemmatização


In [6]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.corpus import wordnet as wn

from nltk.corpus import mac_morpho
from nltk.stem import RSLPStemmer


nltk.download('rslp')
nltk.download('mac_morpho')
nltk.download('stopwords')


# Treinando um tagger simples com mac_morpho
train_data = mac_morpho.tagged_sents()
#print(train_data)
tagger = nltk.UnigramTagger(train_data)
print(train_data)
# StopWords
stopwords_pt = set(stopwords.words('portuguese'))

stemmer = RSLPStemmer()

# to lower case
df['texto_tratado'] = df['texto'].str.lower()

for index, entry in enumerate(df['texto_tratado']):
    tokens = nltk.word_tokenize(entry, language='portuguese') 
    tagged = tagger.tag(tokens)  	
    #print(tagged)
    final_words = []
    for word, tag in tagged:
        if word not in stopwords_pt and word.isalpha():
            final_words.append(stemmer.stem(word))

    df.loc[index, 'texto_final_stem'] = ' '.join(final_words)


[nltk_data] Downloading package rslp to
[nltk_data]     /Users/felipemenezes/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package mac_morpho to
[nltk_data]     /Users/felipemenezes/nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felipemenezes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milhão', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('São', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposição', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('começa', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]


In [7]:
df[df['label'] == 2].head()

Unnamed: 0,texto,classe,label,texto_tratado,texto_final_stem
360,Agendar revisão do carro,Outro,2,agendar revisão do carro,agend revis carr
361,Pedir orçamento para reforma,Outro,2,pedir orçamento para reforma,ped orç reform
362,Agendar ligação para amanhã,Outro,2,agendar ligação para amanhã,agend lig amanhã
363,Solicitar orçamento de compra,Outro,2,solicitar orçamento de compra,solic orç compr
364,Anotar lembrete para reunião,Outro,2,anotar lembrete para reunião,anot lembret reun


In [8]:
df[df['label'] == 3].head()

Unnamed: 0,texto,classe,label,texto_tratado,texto_final_stem
120,"Efetuar o pagamento da fatura de R$ 4600,19",Pagamento,3,"efetuar o pagamento da fatura de r$ 4600,19",efetu pag fatur r
121,Transferir R$ 575 para a conta poupança,Pagamento,3,transferir r$ 575 para a conta poupança,transfer r cont poupanç
122,"Pagar a conta de luz no valor de R$ 328,49",Pagamento,3,"pagar a conta de luz no valor de r$ 328,49",pag cont luz val r
123,"Transferir R$ 350,79 para a conta poupança",Pagamento,3,"transferir r$ 350,79 para a conta poupança",transfer r cont poupanç
124,Transferir R$ 2644 para a conta poupança,Pagamento,3,transferir r$ 2644 para a conta poupança,transfer r cont poupanç


## Stemming LSTM

In [9]:
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# ===== 2. Tokenização =====
vocab_size = 5000
tokenizer2 = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer2.fit_on_texts(df["texto_final_stem"])
sequences = tokenizer2.texts_to_sequences(df["texto_final_stem"])
max_len = max(len(s) for s in sequences)  # tamanho máximo das frases
#max_len = int(np.percentile([len(s) for s in sequences], 95))
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = df["label"].values
#print(y)

# Split treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Class weights para balancear
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

# ===== Modelo LSTM =====
voc_size = len(tokenizer2.word_index) + 1
embedding_vector_features = 40 #20

model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=max_len))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.build(input_shape=(None, max_len))
print(model.summary())

# ===== Treino =====
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=12 #, class_weight=class_weights_dict
)


2025-09-02 20:29:00.652709: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-09-02 20:29:00.652896: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-09-02 20:29:00.652904: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1756855740.653481 7293902 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1756855740.653576 7293902 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


None
Epoch 1/15


2025-09-02 20:29:01.307776: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.3257 - loss: 1.3647 - val_accuracy: 0.4167 - val_loss: 1.2242
Epoch 2/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.6639 - loss: 0.9400 - val_accuracy: 0.9792 - val_loss: 0.3668
Epoch 3/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.9117 - loss: 0.2922 - val_accuracy: 1.0000 - val_loss: 0.0766
Epoch 4/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 1.0000 - loss: 0.0334 - val_accuracy: 1.0000 - val_loss: 0.0050
Epoch 5/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 1.0000 - loss: 0.0053 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 6/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 1.0000 - loss: 0.0021 - val_accuracy: 1.0000 - val_loss: 0.0018
Epoch 7/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x3211a7610>

In [10]:

# ===== 7. Avaliação =====
loss, acc = model.evaluate(X_test, y_test)
print(f"\nAcurácia no teste: {acc:.4f}")


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 3.4125e-04

Acurácia no teste: 1.0000


In [11]:
# ===== 8. Testando previsão =====
nova_frase = ["pagar a conta de luz".lower()]
nova_frase = [word_tokenize(entry, language='portuguese') for entry in nova_frase]

tagged = tagger.tag(nova_frase[0]) 
final_words = []
for word, tag in tagged:
        if word not in stopwords_pt and word.isalpha():
            final_words.append(stemmer.stem(word))

print(final_words)

seq = tokenizer2.texts_to_sequences([final_words])
padded = pad_sequences(seq, maxlen=max_len, padding='post')

# Faz predição
pred = model.predict(padded)[0] 

print(pred)

# Obtém índice da classe com maior probabilidade
classe_idx = np.argmax(pred)
prob = pred[classe_idx]

print(prob)

print("\nFrase:", nova_frase[0])
print("Classe prevista:", label_encoder.inverse_transform([classe_idx]))
print("Probabilidade:", prob)

['pag', 'cont', 'luz']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
[6.5074205e-06 4.3971741e-04 9.0734616e-02 9.0881914e-01]
0.90881914

Frase: ['pagar', 'a', 'conta', 'de', 'luz']
Classe prevista: ['Pagamento']
Probabilidade: 0.90881914


In [12]:

# ==== Dataset de teste ====
dataset2 = [
    ("Pagar conta de luz", None),
    ("Trasferir R$5", None),
    ("Informe meu saldo", None),
    ("Cobra dez reais", None)
]
dfSample = pd.DataFrame(dataset2, columns=["texto", "classe_pred"])



In [13]:
def prever_classe(texto):
    frase_proc = word_tokenize(texto.lower(), language='portuguese')
    tagged = tagger.tag(frase_proc)
    
    final_words = [
        stemmer.stem(word) 
        for word, tag in tagged 
        if word not in stopwords_pt and word.isalpha()
    ]
    
    # Sequência → padding
    seq = tokenizer2.texts_to_sequences([final_words])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    
    # Predição
    pred = model.predict(padded, verbose=0)[0]
    classe_idx = np.argmax(pred)
    prob = pred[classe_idx]
    classe = label_encoder.inverse_transform([classe_idx])[0]
    return classe, prob


In [14]:
# ==== Preencher previsões ====
dfSample["classe_pred"] = dfSample["texto"].apply(
    lambda x: prever_classe(x)[0]
)
dfSample["probabilidade"] = dfSample["texto"].apply(
    lambda x: prever_classe(x)[1]
)

print(dfSample)

                texto     classe_pred  probabilidade
0  Pagar conta de luz       Pagamento       0.908819
1       Trasferir R$5       Pagamento       0.533854
2   Informe meu saldo  Consulta Saldo       0.999572
3     Cobra dez reais        Cobrança       0.948740


## Lemmatization LSTM

In [15]:
from nltk.stem import WordNetLemmatizer
# Lemmatization 
lemmatizer = WordNetLemmatizer()
tag_map = {
    "N": "n", "NPROP": "n", "NUM": "n", "ART": "n",
    "ADJ": "a", "PROADJ": "a",
    "V": "v", "PCP": "v",
    "ADV": "r", "ADV-KS": "r"
}
for index, entry in enumerate(df['texto_tratado']):
    tokens = nltk.word_tokenize(entry, language='portuguese') 
    tagged = tagger.tag(tokens)  	
    final_words = []
    for word, tag in tagged:
        if word not in stopwords_pt and word.isalpha():
            wn_pos = tag_map.get(tag, "n")  # default "n" (substantivo)
            lemma = lemmatizer.lemmatize(word, wn_pos)
            final_words.append(lemma.lower())
 
    df.loc[index, 'texto_final_lemm'] = ' '.join(final_words)

In [19]:
df.head()

Unnamed: 0,texto,classe,label,texto_tratado,texto_final_stem,texto_final_lemm
0,Cobrar o valor total de R$ 2338,Cobrança,0,cobrar o valor total de r$ 2338,cobr val total r,cobrar valor total r
1,Cobrar a quantia de R$ 4137 referente ao serviço,Cobrança,0,cobrar a quantia de r$ 4137 referente ao serviço,cobr quant r refer serviç,cobrar quantia r referente serviço
2,Gerar cobrança automática de R$ 4088,Cobrança,0,gerar cobrança automática de r$ 4088,ger cobranç automá r,gerar cobrança automática r
3,Passe a fatura de R$ 1724 no cartão,Cobrança,0,passe a fatura de r$ 1724 no cartão,pass fatur r cart,passe fatura r cartão
4,"Cobrar taxa de R$ 2370,77 pela entrega",Cobrança,0,"cobrar taxa de r$ 2370,77 pela entrega",cobr tax r entreg,cobrar taxon r entrega


In [16]:
# ===== Tokenização para texto_final_lemm =====
tokenizer_lemm = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_lemm.fit_on_texts(df["texto_final_lemm"])

sequences_lemm = tokenizer_lemm.texts_to_sequences(df["texto_final_lemm"])
max_len_lemm = int(np.percentile([len(s) for s in sequences_lemm], 95))

X_lemm = pad_sequences(sequences_lemm, maxlen=max_len_lemm, padding='post')
y_lemm = df["label"].values

# ===== Split treino/teste =====
X_train_lemm, X_test_lemm, y_train_lemm, y_test_lemm = train_test_split(
    X_lemm, y_lemm, test_size=0.3, random_state=42
)

# ===== Class weights =====
class_weights_lemm = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_lemm),
    y=y_train_lemm
)
class_weights_dict_lemm = dict(enumerate(class_weights_lemm))

# ===== Modelo LSTM =====
voc_size_lemm = len(tokenizer_lemm.word_index) + 1
embedding_vector_features_lemm = 20

model_lemm = Sequential()
model_lemm.add(Embedding(voc_size_lemm, embedding_vector_features_lemm, input_length=max_len_lemm))
model_lemm.add(Bidirectional(LSTM(100)))
model_lemm.add(Dropout(0.3))
model_lemm.add(Dense(len(label_encoder.classes_), activation='softmax'))

model_lemm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lemm.build(input_shape=(None, max_len_lemm))
print(model_lemm.summary())

# ===== Treino =====
model_lemm.fit(
    X_train_lemm, y_train_lemm,
    validation_data=(X_test_lemm, y_test_lemm),
    epochs=15,
    batch_size=12,
    #class_weight=class_weights_dict_lemm
)



None
Epoch 1/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 51ms/step - accuracy: 0.3188 - loss: 1.3752 - val_accuracy: 0.7014 - val_loss: 1.2791
Epoch 2/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7067 - loss: 1.1118 - val_accuracy: 0.8750 - val_loss: 0.6121
Epoch 3/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.8303 - loss: 0.5016 - val_accuracy: 0.9722 - val_loss: 0.2600
Epoch 4/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9815 - loss: 0.2132 - val_accuracy: 1.0000 - val_loss: 0.0317
Epoch 5/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0202 - val_accuracy: 1.0000 - val_loss: 0.0056
Epoch 6/15
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0048 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 7/15
[1m28/28[0m [32

<keras.src.callbacks.history.History at 0x369b705d0>

In [17]:
def prever_classe_lemm(texto):
    # Tokeniza
    frase_proc = nltk.word_tokenize(texto.lower(), language='portuguese')
    tagged = tagger.tag(frase_proc)
    lemmatizer = WordNetLemmatizer()
    tag_map = {
        "N": "n", "NPROP": "n", "NUM": "n", "ART": "n",
        "ADJ": "a", "PROADJ": "a",
        "V": "v", "PCP": "v",
        "ADV": "r", "ADV-KS": "r"
    }
    # Aplica lematização
    final_words_lemm = []
    for word, tag in tagged:
        if word not in stopwords_pt and word.isalpha():
            wn_pos = tag_map.get(tag, "n")  # default "n" (substantivo)
            lemma = lemmatizer.lemmatize(word, wn_pos)
            final_words.append(lemma.lower())

    # Sequência → padding (agora com tokenizer3 e max_len_lemm)
    seq = tokenizer_lemm.texts_to_sequences([final_words_lemm])
    padded = pad_sequences(seq, maxlen=max_len_lemm, padding='post')

    # Predição
    pred = model_lemm.predict(padded, verbose=0)[0]
    classe_idx = np.argmax(pred)
    prob = pred[classe_idx]
    classe = label_encoder.inverse_transform([classe_idx])[0]

    return classe, prob

In [18]:
# ==== Preencher previsões ====
dfSample["classe_pred_lemm"] = dfSample["texto"].apply(
    lambda x: prever_classe_lemm(x)[0]
)
dfSample["probabilidade_lemm"] = dfSample["texto"].apply(
    lambda x: prever_classe_lemm(x)[1]
)

print(dfSample[['texto', 'classe_pred_lemm'] ])

                texto classe_pred_lemm
0  Pagar conta de luz            Outro
1       Trasferir R$5            Outro
2   Informe meu saldo            Outro
3     Cobra dez reais            Outro
