In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def file_parser(file):
    """Lazily split each line of ``file`` into at most two comma-separated fields.

    Every newline character is removed from the line, which is then split on
    its first comma only, so any later commas stay inside the second field.

    Args:
        file: An iterable of strings, e.g. an open text file.

    Yields:
        list[str]: ``[first_field, remainder]``, or a one-element list when
        the line contains no comma.
    """
    for raw_line in file:
        without_newline = raw_line.replace("\n", "")
        yield without_newline.split(",", 1)

# Load the CSV (header line first) into a DataFrame. The file is opened in a
# ``with`` block so its handle is closed once all rows have been consumed; the
# DataFrame is built inside the block because the parser reads lazily.
with open('resultado_genero_descricao.csv', encoding="ISO-8859-1") as csv_file:
    parser = file_parser(csv_file)
    columns = next(parser)  # the first parsed line supplies the column names
    df = pd.DataFrame(parser, columns=columns)

# NOTE: df_test is a second name for the same DataFrame object, not a copy.
df_test = df

In [3]:
# Targets come from the GENERO column, input texts from the DESCRICAO column.
y = df_test['GENERO']
X = df_test['DESCRICAO'].tolist()

# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [4]:
# Learn the token vocabulary from the training texts and replace X_train with
# its vectorized (token-count) form.
# NOTE(review): X_train is rebound from the raw texts to the vectorized result,
# so re-running this cell without re-running the split would feed already
# vectorized data back into fit_transform.
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [5]:
# Vectorize the held-out texts with a second CountVectorizer that is pinned to
# the vocabulary learned from the training data, so both matrices share the
# same columns. X_test is rebound to its vectorized form.
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
X_test = vectorizer_test.transform(X_test)

In [6]:
# Fit a multinomial Naive Bayes classifier on the vectorized training data,
# predict the held-out set, and print per-class precision/recall/F1.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                      precision    recall  f1-score   support

            ACESSORIO DE INFORMATICA       0.94      0.71      0.81        21
              ACESSORIO DE VESTUARIO       1.00      0.83      0.91        12
                 ACESSORIO DE VIAGEM       0.00      0.00      0.00         1
ACESSORIO E EQUIPAMENTO PARA ESPORTE       1.00      0.18      0.31        11
                ACESSORIO ELETRONICO       1.00      0.09      0.17        11
                 ACESSORIO PARA BEBE       1.00      0.75      0.86         4
               ACESSORIO PARA BELEZA       0.93      0.61      0.74        46
              ACESSORIO PARA VEICULO       0.75      0.40      0.52        15
                              ACUCAR       1.00      0.93      0.96        14
                            ADOCANTE       1.00      0.78      0.88         9
                                AGUA       0.99      0.99      0.99        71
                          AGUARDENTE       1.00      0.88      

  'precision', 'predicted', average, warn_for)


In [7]:
from sklearn import metrics

In [8]:
# Overall accuracy of the held-out predictions.
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [9]:
# Print the accuracy in fixed-point notation (the 'f' format uses six decimals).
print('{0:f}'.format(score))

0.867604


In [10]:
# Smoke test: classify a few hand-written descriptions. They are vectorized
# with the already-fitted training vectorizer (transform only, no refit).
teste_predict=['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']
teste_predict_vect = vectorizer_train.transform(teste_predict) 
model.predict(teste_predict_vect)

array(['ACESSORIO DE INFORMATICA', 'UTENSILIO PARA BEBE', 'CARNE BOVINA',
       'PRODUTO DE LIMPEZA'],
      dtype='<U36')

In [29]:
# Class probabilities for the smoke-test descriptions, converted to nested
# Python lists (one inner list per input text).
# NOTE(review): this displays every probability for every text; consider
# showing only a slice.
pred_prob_list = model.predict_proba(teste_predict_vect).tolist()
pred_prob_list

[[0.07634157522721999,
  0.0024458080682125244,
  3.552073387420206e-05,
  0.0008121140958612199,
  0.0016459047419680505,
  0.00035381622060739603,
  0.000705629540463977,
  0.006042376101459818,
  0.00010647770465249604,
  0.00017727116205121593,
  5.326188864499887e-05,
  0.0032324170113890846,
  0.00010652377728999795,
  0.0025431653524875643,
  0.0012268517311005444,
  1.776421158845663e-05,
  0.009349613591455955,
  0.0035351089135871433,
  0.0001242150347129695,
  0.014742888514804093,
  0.004178246249919765,
  0.006063268832590991,
  0.0027750358842035115,
  1.7762289054734897e-05,
  0.0007065661944424945,
  0.006188786701095364,
  1.7762929853070186e-05,
  0.0004070787954181244,
  0.0011371954808947623,
  0.0004253282821269854,
  0.015670713322547374,
  0.01706115975324787,
  0.0007018575996539696,
  0.001020092796950905,
  0.03641581422987874,
  0.006732851785766502,
  0.0001950825623516825,
  0.000739983089139216,
  5.324460953758747e-05,
  0.009499525040332848,
  0.00433318

In [30]:
# Highest class probability assigned to the first smoke-test description.
greatest_prob = max(pred_prob_list[0])
greatest_prob

0.07634157522721999

In [32]:
def get_greatest_probabilities(pred_prob_list):
    """Return the largest value found in each row of ``pred_prob_list``.

    Args:
        pred_prob_list: Iterable of non-empty iterables of comparable values,
            e.g. the nested list produced by ``predict_proba(...).tolist()``.

    Returns:
        list: One maximum per row, in input order.

    Raises:
        ValueError: If any row is empty (raised by ``max``).
    """
    return [max(row) for row in pred_prob_list]

In [33]:
# Top probability for each smoke-test description.
get_greatest_probabilities(pred_prob_list)

[0.07634157522721999,
 0.5737664571318083,
 0.9629852103045713,
 0.8362871692463874]

In [11]:
def prediction_script(text):
    """Predict a label for each document in ``text``.

    The documents are vectorized with the module-level ``vectorizer_train``
    and classified with the module-level ``model``.

    Args:
        text: Iterable of raw strings to classify.

    Yields:
        The result of ``model.predict`` for the vectorized ``text``. Exactly
        one item is yielded, holding the predictions for all documents.
    """
    predict_vect = vectorizer_train.transform(text)
    # Predict on the vector built from `text`. The previous version predicted
    # on the global `teste_predict_vect`, which ignored the argument.
    yield model.predict(predict_vect)

In [12]:
# prediction_script is a generator, so list() is needed to run it and collect
# the single array it yields.
list(prediction_script(['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']))

[array(['ACESSORIO DE INFORMATICA', 'UTENSILIO PARA BEBE', 'CARNE BOVINA',
        'PRODUTO DE LIMPEZA'],
       dtype='<U36')]

In [13]:
# Output path for the serialized classifier.
filename = 'model_v1.pkl'

In [14]:
# Prefer the standalone joblib package: the copy vendored under
# sklearn.externals is no longer shipped by modern scikit-learn. The fallback
# keeps old environments that only have the vendored copy working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [15]:
# Serialize the trained classifier to `filename`; the `with` block closes the
# file handle on exit.
with open(filename, 'wb') as file:
	joblib.dump(model, file)

In [16]:
# NOTE(review): redundant. `file` was opened in a `with` block, which already
# closed it; closing an already-closed file is a no-op. This cell can be deleted.
file.close()

In [17]:
# Output path for the serialized vectorizer vocabulary.
vocab_filename = 'model_vocabulary.pkl'

In [18]:
# Serialize the fitted vocabulary (token -> column index) so the same feature
# space can be rebuilt later. Write through the handle opened here; passing the
# path instead made joblib open the same file a second time while this handle
# sat unused.
with open(vocab_filename, 'wb') as file2:
    joblib.dump(vectorizer_train.vocabulary_, file2)

In [19]:
# Display the learned token -> column-index mapping.
# NOTE(review): this prints the entire vocabulary; consider showing only its
# size or a small sample to keep the notebook output readable.
vectorizer_train.vocabulary_

{'freez': 15602,
 '01un': 319,
 'cetomed': 11349,
 'ped': 21552,
 'hero': 16726,
 'ref5183': 23479,
 'cerveja': 11331,
 'refil269ml': 23490,
 'peixcagua': 21585,
 'l15p12x90g': 17916,
 '7899726916732': 6967,
 'fps90': 15548,
 '1036': 789,
 'biz100': 9909,
 'gsa': 16457,
 'sauza': 24382,
 '13072': 1503,
 'jaborandi': 17507,
 'p220': 21147,
 'shefa': 24735,
 'glitt': 16196,
 'sodica': 25007,
 'habito': 16567,
 '6821': 5855,
 'vitalux': 27167,
 '62790': 5719,
 '200x1': 2553,
 'carro': 11022,
 'flormaracuja': 15310,
 'soprador': 25097,
 'fortal': 15463,
 'belcomplex': 9603,
 'oetk': 20825,
 'leve12': 18299,
 'betnovate': 9725,
 'bruxa': 10356,
 'afp': 8090,
 'alergidex': 8244,
 'r2473': 23108,
 'quarteto': 22913,
 'vitalin': 27164,
 'canola': 10822,
 'furanil': 15780,
 '6x200gr': 5987,
 '380w': 4179,
 'precozido': 22428,
 'pour': 22341,
 'transitions': 26308,
 'p350ml': 21161,
 'queratina': 22954,
 '76l': 6284,
 'cliente': 11804,
 'marea': 19117,
 'salicl': 24229,
 'cloroforte': 11858,
 'e