In [2]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
def file_parser(file):
    """Lazily split each line of ``file`` into a label and the remaining text.

    Parameters
    ----------
    file : iterable of str
        Any iterable yielding text lines (e.g. an open text file).

    Yields
    ------
    list of str
        The line with every newline character removed, split on the FIRST
        comma only. A line containing a comma yields two items (later commas
        stay inside the second item); a line without a comma yields one item.
    """
    for raw_line in file:
        # Only the first comma separates the fields; the rest belong to the text.
        yield raw_line.replace("\n", "").split(",", 1)

# Load the raw CSV. The file is opened in a context manager so the handle is
# always closed; because `file_parser` is lazy, the DataFrame must be built
# while the file is still open.
with open('resultado_genero_descricao.csv', encoding="ISO-8859-1") as csv_file:
    parser = file_parser(csv_file)
    # The first parsed row is the header and supplies the column names.
    columns = next(parser)
    # The remaining rows are consumed from the generator here.
    df = pd.DataFrame(parser, columns=columns)

# NOTE: this is an alias of `df`, not a copy.
df_test = df

In [4]:
# Target labels come from the 'GENERO' column; the input texts come from the
# 'DESCRICAO' column, converted to a plain list of values.
y = df_test['GENERO']
X = df_test['DESCRICAO'].tolist()
# Hold out 10% for evaluation. The seed is fixed so the split, and every metric
# computed from it, is reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [5]:
# Bag-of-words features: fit a CountVectorizer on the training texts only and
# replace X_train with the matrix returned by fit_transform.
# NOTE(review): X_train is overwritten in place, so this cell is presumably not
# safe to re-run without re-running the train/test split first — confirm.
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [6]:
# Transform the held-out texts with the SAME fitted vectorizer used for training.
# Rebuilding a second CountVectorizer from the vocabulary alone would silently
# diverge if the training vectorizer ever got non-default settings (n-grams,
# tokenizer, lowercase, ...). `vectorizer_test` is kept as an alias so any code
# that still refers to that name keeps working.
vectorizer_test = vectorizer_train
X_test = vectorizer_test.transform(X_test)

In [7]:
# Fit a multinomial Naive Bayes classifier on the training count matrix, predict
# the held-out rows, and print the per-class precision / recall / f1 report.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

                                      precision    recall  f1-score   support

            ACESSORIO DE INFORMATICA       0.73      0.85      0.79        13
              ACESSORIO DE VESTUARIO       1.00      0.64      0.78        14
                 ACESSORIO DE VIAGEM       0.00      0.00      0.00         1
ACESSORIO E EQUIPAMENTO PARA ESPORTE       0.00      0.00      0.00         7
                ACESSORIO ELETRONICO       0.00      0.00      0.00         7
                   ACESSORIO MUSICAL       1.00      0.50      0.67         2
                 ACESSORIO PARA BEBE       1.00      0.50      0.67         6
               ACESSORIO PARA BELEZA       0.88      0.74      0.81        39
              ACESSORIO PARA CELULAR       0.00      0.00      0.00         2
              ACESSORIO PARA VEICULO       0.82      0.50      0.62        18
                              ACUCAR       1.00      0.78      0.88        18
                            ADOCANTE       1.00      0.80      

  'precision', 'predicted', average, warn_for)


In [8]:
from sklearn import metrics

In [9]:
# Overall accuracy of the held-out predictions (arguments are y_true, y_pred).
score = metrics.accuracy_score(y_test, y_pred)

In [10]:
# Show the accuracy in fixed-point notation (default 6 decimal places).
print(format(score, 'f'))

0.867731


In [11]:
# Ad-hoc check: vectorize a few hand-written descriptions with the fitted
# vectorizer and display the classes the model predicts for them.
teste_predict = [
    'EPSON MAGENTO',
    'LILLO BICO',
    'BOV AMERICANA KG',
    'KING 100ML JASMIM',
]
teste_predict_vect = vectorizer_train.transform(teste_predict)
model.predict(teste_predict_vect)

array(['ACESSORIO DE INFORMATICA', 'UTENSILIO PARA BEBE', 'CARNE BOVINA',
       'PRODUTO DE LIMPEZA'],
      dtype='<U36')

In [12]:
def prediction_script(text):
    """Predict a class for each description in ``text``.

    Parameters
    ----------
    text : str or iterable of str
        Raw descriptions to classify. A single string is treated as one
        document rather than being rejected or iterated character by character.

    Yields
    ------
    The result of ``model.predict`` for the vectorized ``text``, yielded once.
    The function stays a generator so existing callers that wrap it in
    ``list(...)`` keep working.

    Notes
    -----
    Relies on the module-level ``vectorizer_train`` (fitted vectorizer) and
    ``model`` (fitted classifier).
    """
    if isinstance(text, str):
        text = [text]
    predict_vect = vectorizer_train.transform(text)
    # Predict on the vector built from the argument. The previous version used the
    # unrelated global `teste_predict_vect`, so the input was silently ignored.
    yield model.predict(predict_vect)

In [13]:
list(prediction_script(['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']))

[array(['ACESSORIO DE INFORMATICA', 'UTENSILIO PARA BEBE', 'CARNE BOVINA',
        'PRODUTO DE LIMPEZA'],
       dtype='<U36')]

In [14]:
# Output path used when the fitted classifier is written to disk.
filename = 'model_v1.pkl'

In [15]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [16]:
# Serialize the fitted classifier to `filename`; the `with` block closes the
# handle as soon as the dump finishes.
with open(filename, 'wb') as file:
    joblib.dump(model, file)

In [17]:
# Redundant: the `with` statement that opened `file` already closed it on exit;
# calling close() on an already-closed file object is a harmless no-op.
file.close()

In [23]:
# Output path used when the vectorizer's vocabulary is written to disk.
vocab_filename = 'model_vocabulary.pkl'

In [24]:
# Persist the learned vocabulary so a vectorizer can be rebuilt later. Write
# through the handle opened here: passing the path string instead made joblib
# open the same file a second time while `file2` was still open and unused.
with open(vocab_filename, 'wb') as file2:
    joblib.dump(vectorizer_train.vocabulary_, file2)

In [22]:
# Display the fitted vocabulary (a dict mapping each token to an integer index).
# NOTE(review): this prints the entire mapping; consider showing only its size
# or a small sample to keep the notebook output readable.
vectorizer_train.vocabulary_

{'7896003700817': 6566,
 'topamax': 26251,
 '144': 1615,
 'ipldc': 17432,
 'pac': 21263,
 '1417': 1609,
 'terconan': 25982,
 'celebra': 11287,
 'fluir': 15406,
 'kuka': 17926,
 'grip': 16484,
 '326050306': 3934,
 'litrao': 18576,
 'ciprofloxatrin': 11716,
 'carlton': 11027,
 'lpl42': 18726,
 'satelite': 24469,
 'corticorten': 12443,
 '1x200ml': 2342,
 'nimbus': 20581,
 'avelas': 9203,
 'prednisona': 22536,
 'paracet': 21428,
 'lousa': 18705,
 '16x1of': 1963,
 '635ml': 5766,
 'r526': 23290,
 '000926': 199,
 'r0517': 23086,
 'ciclofemme': 11642,
 'extraordina': 14827,
 'ocb': 20861,
 'gir500g': 16182,
 'miramar': 19788,
 'techline': 25886,
 'klerat': 17868,
 'spaguetti': 25259,
 'douro': 13839,
 'unigyn': 26772,
 'renitec': 23718,
 'baycuten': 9542,
 '740': 6248,
 'gravtest': 16451,
 '60c': 5638,
 'fibrems': 15153,
 '6x343ml': 6057,
 'ph170': 21908,
 '0000000864': 73,
 '32und': 3963,
 'pb1': 21569,
 'indapen': 17218,
 'l68gp': 18025,
 'fastac': 14954,
 '276l': 3425,
 'citalopram': 11739,