In [71]:
import pandas as pd
import numpy as np

In [72]:
# Load the labelled training data. The cells below read the intent label from
# the "Intent" column and the raw text from the "Message" column.
data_path = "./data/data.csv"
root_df = pd.read_csv(data_path)
root_df

Unnamed: 0,Intent,Message
0,Greeting,Oi
1,Greeting,Olá
2,Greeting,Ola
3,Greeting,Bom dia
4,Greeting,Boa tarde
5,Greeting,Boa noite
6,Payment,Estou com dificuldades em pagar meus boletos
7,Payment,Não consigo pagar o boleto da mensalidade
8,Payment,"Minha fatura esta atrasada, como procedo ?"
9,Payment,Não estou achando a aba de mensalidades


In [73]:
# SkLearn Imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectKBest, chi2

In [74]:
# Vectorizing X
#
# Build a binary bag-of-words matrix from the messages, ignoring stop words.

# Read the stop-word list as plain text, one entry per line.
# Deliberately not pd.read_csv: its default header=0 would consume the first
# line of the file as a column name (silently dropping the first stop word),
# and its NA parsing turns tokens such as "null" or "nan" into NaN.
# NOTE(review): assumes stop-words.txt has no header row -- confirm.
# "utf-8-sig" decodes UTF-8 and drops a leading BOM if one is present.
with open("./stop-words.txt", encoding="utf-8-sig") as stop_words_file:
    st_words = [line.strip() for line in stop_words_file if line.strip()]

# binary=True records presence/absence of a token rather than its count.
vec = CountVectorizer(lowercase=True, binary=True, stop_words=st_words)
X = vec.fit_transform(raw_documents=root_df["Message"]).toarray()

X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [75]:
# Encoding Y
#
# Map each intent label to an integer class id. The fitted encoder `le` is
# kept so predicted ids can be turned back into intent names.
intent_labels = root_df["Intent"]

le = LabelEncoder()
Y = le.fit_transform(intent_labels)

Y

array([3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 2, 2, 2, 4, 4, 4, 4, 5, 5, 5, 5, 5])

In [76]:
# Select Most Relevant Columns
#
# Keep only the 6 vocabulary columns with the highest chi-squared score
# against the encoded intent labels.
# NOTE(review): this reassigns X to the reduced matrix, and the pipeline cell
# that follows is fitted on the raw messages, so the trained model does not
# go through this selector -- confirm whether that is intended.
KBest = SelectKBest(score_func=chi2, k=6)
X = KBest.fit_transform(X, Y)

X

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]], dtype=int64)

In [77]:
# Train the intent classifier: the CountVectorizer configured earlier feeds a
# multinomial Naive Bayes model. Fitting on the raw messages re-fits `vec`,
# since it is the first step of the pipeline.
classifier = MultinomialNB()
model = make_pipeline(vec, classifier)
model.fit(root_df["Message"], Y)


In [78]:
# Tests
#
# Classify one message typed by the user, then show the predicted intent name
# followed by the probability the model assigns to each class.

# The pipeline expects an iterable of documents, hence the one-element list.
new_message = [input()]

predict = model.predict(new_message)
predicted_intent = le.inverse_transform(predict)[0]
class_probabilities = model.predict_proba(new_message)

print("Predict: " + predicted_intent)
print(class_probabilities)

Predict: Greeting
[[0.08843814 0.12233943 0.07340366 0.28598828 0.09532943 0.11327725
  0.11327725 0.10794656]]


In [79]:
import os

import joblib

# Persist the fitted pipeline and the label encoder to disk.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; exist_ok=True keeps re-runs from failing.
os.makedirs("./dump", exist_ok=True)

joblib.dump(model, "./dump/model.sav")
joblib.dump(le, "./dump/encoder.sav")

['./dump/encoder.sav']