In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def file_parser(file):
    for line in file:
        line = re.sub("\n", "", line)
        yield re.split(r',', line, 1)

parser = file_parser(open('Teste_Classificacao_Out_01-15_GENERO-PRODUTO.csv', encoding="ISO-8859-1"))
columns = next(parser)


df = pd.DataFrame(parser, columns=columns)
df_test=df

In [3]:
y = df_test['GENERO']
X = df_test['PRODUTO'].tolist()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1)

In [4]:
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [5]:
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
X_test = vectorizer_test.transform(X_test)

In [6]:
model = MultinomialNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

                                      precision    recall  f1-score   support

            ACESSORIO DE INFORMATICA       0.95      0.75      0.84        28
              ACESSORIO DE VESTUARIO       0.91      0.65      0.75        31
ACESSORIO E EQUIPAMENTO PARA ESPORTE       0.50      0.08      0.14        12
                ACESSORIO ELETRONICO       0.93      0.45      0.60        29
                   ACESSORIO MUSICAL       1.00      0.50      0.67         2
                 ACESSORIO PARA ARMA       0.00      0.00      0.00         1
                 ACESSORIO PARA BEBE       0.00      0.00      0.00         2
               ACESSORIO PARA BELEZA       0.96      0.69      0.80        35
              ACESSORIO PARA VEICULO       1.00      0.37      0.54        30
               ACESSORIOS RELIGIOSOS       0.00      0.00      0.00         1
                              ACUCAR       1.00      0.84      0.91        19
                            ADOCANTE       1.00      0.71      

  'precision', 'predicted', average, warn_for)


In [7]:
from sklearn import metrics

In [8]:
score = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)

In [9]:
print('{0:f}'.format(score))

0.873233


In [10]:
teste_predict=['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']
teste_predict_vect = vectorizer_train.transform(teste_predict) 
model.predict(teste_predict_vect)

array(['ACESSORIO DE INFORMATICA', 'PRODUTO FARMACEUTICO', 'CARNE BOVINA',
       'PRODUTO DE LIMPEZA'],
      dtype='<U36')

In [11]:
pred_prob_list = model.predict_proba(teste_predict_vect).tolist()
pred_prob_list

[[1.2694218483096216e-05,
  0.12037327091806782,
  0.0029867110222834775,
  8.879864130517398e-05,
  0.0013849862548852248,
  0.0037610516739203514,
  0.00034125852965713975,
  5.075720235767034e-05,
  0.0003414004040273021,
  0.0041612074853879365,
  0.00012677211223606545,
  0.0002026242372559244,
  1.2694596940507325e-05,
  6.342949062691733e-05,
  0.0026368117363511825,
  0.00011414247736958576,
  0.002271274079908492,
  0.0010521675098133134,
  2.538843696619248e-05,
  0.008291174982308944,
  0.0024744488344592866,
  6.345028470204691e-05,
  1.2694975420485273e-05,
  0.01190261227833865,
  0.004205721331632291,
  0.005544669333075586,
  0.0021168790315421303,
  0.0006435678718401846,
  0.0054820779524680065,
  1.269535392303208e-05,
  0.0004925634648087258,
  2.5386923271932974e-05,
  0.0022860211784557784,
  0.0007573631231941958,
  0.01273958594362361,
  0.014000613729585205,
  0.0004655500815993728,
  0.001438816725429059,
  0.005359432516540352,
  0.00017742285151638274,
  0.0

In [12]:
greatest_prob = max(pred_prob_list[0])
greatest_prob

0.12037327091806782

In [13]:
def get_greatest_probabilities(pred_prob_list):
    max_probabilities = []
    for i in pred_prob_list:
       max_probabilities.append(max(i))
    return max_probabilities

In [14]:
get_greatest_probabilities(pred_prob_list)

[0.12037327091806782,
 0.4467453979499358,
 0.9926957646603939,
 0.9159996731431743]

In [15]:
def prediction_script(text):
    predict_vect = vectorizer_train.transform(text)
    yield model.predict(teste_predict_vect)

In [16]:
list(prediction_script(['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']))

[array(['ACESSORIO DE INFORMATICA', 'PRODUTO FARMACEUTICO', 'CARNE BOVINA',
        'PRODUTO DE LIMPEZA'],
       dtype='<U36')]

In [17]:
filename = 'model_v2.pkl'

In [18]:
from sklearn.externals import joblib

In [19]:
with open(filename, 'wb') as file:
	joblib.dump(model, file)

In [20]:
file.close()

In [21]:
vocab_filename = 'model_v2_vocabulary.pkl'

In [22]:
with open(vocab_filename, 'wb') as file2:
    joblib.dump(vectorizer_train.vocabulary_, vocab_filename)

In [23]:
vectorizer_train.vocabulary_

{'atip': 11368,
 '25mg': 4245,
 '30': 4731,
 'comp': 14962,
 'c1': 13056,
 'cx': 15822,
 'rexona': 28957,
 'deo': 16303,
 'aer': 10143,
 'ap': 10959,
 '90g': 9472,
 'active': 10008,
 'nistatina': 25146,
 'nistamax': 25144,
 'susp': 30981,
 'oral': 25659,
 '50ml': 6573,
 'co': 14733,
 'natur': 24843,
 'buscopan': 13007,
 'composto': 15002,
 '20ml': 3530,
 'gotas': 20091,
 'betoneira': 12135,
 'turbo': 32151,
 'esmalte': 17854,
 'color': 14876,
 'turquia': 32163,
 'lua': 22924,
 'cresc': 15588,
 '8ml': 9337,
 'lv': 23018,
 'top': 31715,
 'flor': 18887,
 'ultra': 32238,
 'qs': 27951,
 '12x1l': 1866,
 'sab': 29383,
 'protex': 27710,
 'balance': 11689,
 'pct': 26314,
 '12': 1594,
 'biq': 12314,
 'ad': 10026,
 'calã': 13375,
 'tanga': 31185,
 'sut': 30995,
 'cortininha': 15299,
 'cj': 14511,
 'cetaphil': 14069,
 'loc': 22772,
 'hid': 20626,
 '473ml': 6128,
 '101': 906,
 'fr': 19192,
 'flying': 18982,
 'horse': 20829,
 'energetico': 17609,
 '270ml': 4392,
 'mistura': 24170,
 'brioche': 12839,