In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def file_parser(file):
    """Lazily split each line of a comma-separated text source into two fields.

    Every newline character is removed from the line, which is then split on
    the FIRST comma only, so any further commas stay inside the second field.

    Args:
        file: Any iterable of strings (for example an open text file).

    Yields:
        list[str]: ``[first, rest]`` for a line that contains a comma, or a
        one-element list when the line has no comma (including blank lines).
    """
    for line in file:
        # Plain str methods replace the regex calls: both patterns were literal
        # text, and passing maxsplit positionally to re.split is deprecated in
        # recent Python versions.
        yield line.replace("\n", "").split(",", 1)

# Read the CSV inside a `with` block so the file handle is closed as soon as
# the DataFrame has been built (pd.DataFrame consumes the generator eagerly).
with open('Teste_Classificacao_Out_01-15_GENERO-PRODUTO.csv', encoding="ISO-8859-1") as csv_file:
    parser = file_parser(csv_file)
    columns = next(parser)  # the first parsed row supplies the column names

    df = pd.DataFrame(parser, columns=columns)
df_test=df  # second name for the same DataFrame object (not a copy)

In [3]:
# Labels come from the 'GENERO' column; the raw text features from 'PRODUTO'.
y = df_test['GENERO']
X = df_test['PRODUTO'].tolist()
# Hold out 10% of the rows for evaluation. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=42)

In [4]:
# Learn the token vocabulary from the training texts and replace X_train with
# the resulting token-count matrix.
# NOTE: X_train is rebound here, so re-running this cell on its own would feed
# the matrix back in instead of the raw texts; re-run the split cell first.
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [5]:
# Encode the held-out texts with the vectorizer that was fitted on the training
# data. Reusing the fitted instance (instead of building a second
# CountVectorizer from its vocabulary) guarantees the same tokenisation
# settings and column order even if vectorizer_train is later given
# non-default options.
vectorizer_test = vectorizer_train  # name kept for any later references
X_test = vectorizer_test.transform(X_test)

In [6]:
# Fit a multinomial Naive Bayes classifier on the training matrix, predict the
# held-out rows, and print the per-class precision / recall / F1 report.
model = MultinomialNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

                                      precision    recall  f1-score   support

            ACESSORIO DE INFORMATICA       0.86      0.73      0.79        26
              ACESSORIO DE VESTUARIO       0.95      0.75      0.84        28
ACESSORIO E EQUIPAMENTO PARA ESPORTE       0.56      0.38      0.45        13
                ACESSORIO ELETRONICO       1.00      0.46      0.63        24
                   ACESSORIO MUSICAL       1.00      0.25      0.40         8
               ACESSORIO PARA BELEZA       0.95      0.51      0.67        35
              ACESSORIO PARA CALCADO       0.00      0.00      0.00         1
              ACESSORIO PARA CELULAR       0.00      0.00      0.00         2
              ACESSORIO PARA VEICULO       0.92      0.39      0.55        31
               ACESSORIOS RELIGIOSOS       0.00      0.00      0.00         2
                              ACUCAR       1.00      0.88      0.94        26
                            ADOCANTE       1.00      0.56      

  'precision', 'predicted', average, warn_for)


In [7]:
from sklearn import metrics

In [8]:
# Overall accuracy on the held-out split (true labels first, predictions second).
score = metrics.accuracy_score(y_test, y_pred)

In [9]:
# Show the accuracy in fixed-point notation (six decimal places by default).
print(format(score, 'f'))

0.875530


In [10]:
# Ad-hoc sanity check: classify a few hand-written product descriptions using
# the vectorizer that was fitted on the training data.
teste_predict=['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']
teste_predict_vect = vectorizer_train.transform(teste_predict) 
model.predict(teste_predict_vect)

array(['ACESSORIO DE INFORMATICA', 'PRODUTO FARMACEUTICO', 'CARNE BOVINA',
       'PRODUTO DE LIMPEZA'],
      dtype='<U36')

In [11]:
# Class-probability estimates for the sample texts, converted to nested Python
# lists: one inner list per input text.
# NOTE(review): displaying the full list prints every class probability for
# every sample; consider showing only a slice.
pred_prob_list = model.predict_proba(teste_predict_vect).tolist()
pred_prob_list

[[1.2759417400978938e-05,
  0.11870849630246057,
  0.0030376964546336683,
  8.925469729036973e-05,
  0.0013800345796319978,
  0.0038400510348687807,
  0.0002670557212576929,
  6.376285701902094e-05,
  0.00036844077621280263,
  0.004184149749770171,
  0.00011469792639539536,
  0.00017824377931635702,
  1.275979794968012e-05,
  6.37552553431725e-05,
  0.0026385249951679417,
  0.00010200840854907593,
  0.002198743263226014,
  0.0010327506775970894,
  2.5518834801957923e-05,
  0.008416530550151983,
  0.0024745402594413997,
  6.377616431454397e-05,
  1.276017852108159e-05,
  0.011821030941695133,
  0.004249178442972363,
  0.005813783351069496,
  0.0022251384389938853,
  1.275979794968012e-05,
  0.0006342632767817477,
  0.005366300408446221,
  0.0005330498427453377,
  3.827140375609714e-05,
  0.0022505949739219594,
  0.000773984791290833,
  0.012916800704967807,
  0.01394488541339684,
  0.0004554676195881612,
  0.0014466238026258366,
  0.005364031914816277,
  0.0002037374917339481,
  0.00132

In [12]:
# Highest class probability for the first sample text only.
greatest_prob = max(pred_prob_list[0])
greatest_prob

0.11870849630246057

In [13]:
def get_greatest_probabilities(pred_prob_list):
    """Return the largest value found in each row of ``pred_prob_list``.

    Args:
        pred_prob_list: Iterable of non-empty iterables of comparable values
            (for example the nested list built from ``predict_proba``).

    Returns:
        list: One maximum per row, in the same order as the input rows.

    Raises:
        ValueError: If any row is empty (raised by ``max``).
    """
    return [max(row) for row in pred_prob_list]

In [14]:
# Top probability for each of the sample texts.
get_greatest_probabilities(pred_prob_list)

[0.11870849630246057, 0.440921456544193, 0.992694670203103, 0.9100697920559447]

In [15]:
def prediction_script(text):
    """Predict a class label for every string in ``text``.

    Relies on the module-level ``vectorizer_train`` (fitted vectorizer) and
    ``model`` (fitted classifier).

    Args:
        text: Iterable of raw strings to classify.

    Yields:
        A single item: the value returned by ``model.predict`` for the
        vectorized input. The one-shot generator form is kept so existing
        callers that wrap the call in ``list(...)`` keep working.
    """
    # Predict on the matrix built from the caller's input. The previous version
    # predicted on the global `teste_predict_vect`, so `text` was silently
    # ignored and the same four sample predictions were always returned.
    predict_vect = vectorizer_train.transform(text)
    yield model.predict(predict_vect)

In [16]:
# Exhaust the generator to obtain its single yielded prediction array.
list(prediction_script(['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']))

[array(['ACESSORIO DE INFORMATICA', 'PRODUTO FARMACEUTICO', 'CARNE BOVINA',
        'PRODUTO DE LIMPEZA'],
       dtype='<U36')]

In [17]:
# Path the fitted classifier is serialized to.
filename = 'model_v2.pkl'

In [18]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [19]:
# Serialize the fitted classifier to `filename`; the `with` block closes the
# handle automatically when it exits.
with open(filename, 'wb') as file:
	joblib.dump(model, file)

In [20]:
# NOTE(review): redundant. `file` was already closed when the `with` block that
# opened it exited; closing an already-closed file is a no-op.
file.close()

In [21]:
# Path the fitted vectorizer's vocabulary is serialized to.
vocab_filename = 'model_v2_vocabulary.pkl'

In [22]:
# Persist the fitted vocabulary (token -> column index mapping) so the same
# feature space can be rebuilt when the model is loaded elsewhere.
with open(vocab_filename, 'wb') as file2:
    # Write through the handle opened above. Passing the path again made joblib
    # open the same file a second time while `file2` sat open and unused.
    joblib.dump(vectorizer_train.vocabulary_, file2)

In [23]:
# Inspect the learned token -> column index mapping.
# NOTE(review): this displays the entire vocabulary; consider showing only a
# small sample to keep the notebook output readable.
vectorizer_train.vocabulary_

{'limp': 22528,
 'veja': 32588,
 'uso': 32385,
 'lavanda': 22250,
 'alcool': 10287,
 'und': 32282,
 '500ml': 6426,
 'brasart': 12705,
 'hct': 20427,
 '160': 2503,
 '12': 1603,
 '30cpr': 4858,
 'rev': 28903,
 'ems': 17496,
 'cx': 15779,
 'nissin': 25094,
 'lamen': 22132,
 'cremoso': 15534,
 'picanha': 26695,
 '88': 9244,
 'grs': 20200,
 '7891079001523': 8216,
 'apontador': 10985,
 'cis': 14426,
 'dep': 16258,
 '370': 5374,
 '24un': 3996,
 'refrig': 28642,
 'fanta': 18294,
 'laranja': 22189,
 '5l': 6966,
 'desod': 16424,
 'sanit': 29552,
 'leve': 22407,
 'brisa': 12808,
 'pedra': 26347,
 '25g': 4237,
 'cj': 14462,
 'corpo': 15209,
 'tampa': 31155,
 'caixa': 13236,
 'dagua': 15975,
 '1000l': 751,
 'pec': 26318,
 'cola': 14755,
 'isopor': 21394,
 'fort': 19039,
 'fix': 18725,
 '040grs': 470,
 'tb': 31235,
 'gen': 19668,
 'ambroxol': 10579,
 'xpe': 33291,
 'inf': 21094,
 '120ml': 1648,
 'neoqu': 24932,
 'sab': 29365,
 'liq': 22614,
 'protex': 27673,
 'profunda': 27566,
 '102': 970,
 'fr': 1