In [39]:
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [40]:
def file_parser(file):
    """Lazily split each line of ``file`` into at most three comma-separated fields.

    Every newline character is removed from the line first. The line is then
    split on its first two commas only, so any further commas (and any
    surrounding quote characters) stay inside the third field.

    Args:
        file: An iterable of text lines, e.g. an open text file.

    Yields:
        list[str]: Up to three fields for each input line.
    """
    for raw_line in file:
        yield raw_line.replace("\n", "").split(",", 2)

# Build the DataFrame while the CSV is open: file_parser is lazy, so rows are
# only read when pd.DataFrame consumes the generator. Leaving the with-block
# then closes the file handle, which the previous bare open() call never did.
with open('produtos-ago-set-out-2017.csv', encoding="ISO-8859-1") as csv_file:
    parser = file_parser(csv_file)
    columns = next(parser)  # the first line of the file holds the column names
    df = pd.DataFrame(parser, columns=columns)

# NOTE: df_test is a second name for the same DataFrame object, not a copy.
df_test = df

In [41]:
df.columns

Index(['CLASSE_ID', 'GENERO_ID', 'DESCRICAO'], dtype='object')

In [42]:
df.GENERO_ID.unique()

array(['266', '269', '325', '1521', '1561', '1372', '1594', '1562', '583',
       '182', '179', '181', '178', '180', '1593', '221', '141', '139',
       '45', '1661', '43', '69', '1961', '70', '21', '75', '76', '66',
       '65', '62', '72', '73', '71', '67', '64', '61', '68', '356', '494',
       '1461', '206', '204', '462', '207', '201', '203', '202', '205',
       '2041', '100', '106', '91', '90', '84', '89', '88', '340', '1581',
       '701', '509', '506', '500', '505', '508', '497', '349', '351',
       '135', '131', '132', '134', '699', '79', '80', '431', '1565',
       '2021', '1389', '1596', '1481', '1390', '1391', '1881', '1401',
       '1541', '1294', '397', '348', '347', '354', '355', '2181', '280',
       '1901', '276', '281', '1564', '307', '2201', '309', '302', '313',
       '161', '167', '175', '174', '327', '162', '1302', '241', '1721',
       '1921', '271', '1501', '1941', '1861', '277', '282', '185', '275',
       '274', '314', '1563', '273', '311', '285', '184', '312

In [43]:
# Target is the CLASSE_ID column; the feature is the raw DESCRICAO text.
y = df_test['CLASSE_ID']
X = df_test['DESCRICAO'].tolist()
# Hold out 10% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [44]:
type(y)

pandas.core.series.Series

In [45]:
X

['ESTERCO 1T',
 'ADUBO HUMUS DE MINHOCA GNUMUS PACOTE 3KG',
 'ADUBO HUMUS DE MINHOCA GNUMUS PACOTE 5KG',
 'SULFATO DE POTASSIO OUROFERTIL EM PO SACA 25KG',
 'CALCARIO AGRICOLA CHAVES SACA 40KG',
 'FERTILIZANTE AMIORGAN SACA 50KG',
 'FERTILIZANTE SOLIDO TOP PHOS 328 50KG',
 'FERTILIZANTE YARA SACA 50KG',
 'DRIPSOL KCL STANDARD SACA 25KG',
 'DRIPSOL NKS SACA 25KG',
 'CAPA PARA ESPINGARDA ALMOFADA CAMUFLADA',
 'CHUMBINHO CHAKAL VELOZ PARA ESPINGARDA 5.5MM CX 125',
 'CHUMBINHO DIABOLO PARA ESPINGARDA 5.1/2MM CX 125',
 '"CHUMBINHO LOBO SPEED PARA ESPINGARDA 5,5MM CX 125"',
 '"CHUMBINHO DIABOLO ROSSI PARA ESPINGARDA 4,5MM CX C/250"',
 'LIVRO DE RECORDACOES E MENINO',
 'ALBUM LIVRO DO BEBE MENINO 34 FOLHAS',
 'ALBUM FOTOGRAFICO 10X15 80 FOTOS',
 'POSTER MOTO HONDA BIZ',
 'CORDAO DE SILICONE PARA CRACHA',
 'PORTA CRACHA ACP HORIZONTAL COM JACARE REF:C-7ACP PACOTE COM 50',
 'CRACHA HORIZONTAL ACP 07X10 PRENDEDOR METAL',
 'CAIXA RETANGULAR PAPER BOX PARA PRESENTE 10X10CM',
 'CAIXA RETANGULAR PAP

In [46]:
# Learn the token vocabulary from the training descriptions and replace
# X_train with the resulting token-count matrix.
# NOTE: X_train is overwritten here, so re-running this cell without first
# re-running the train/test split passes the matrix (not the raw strings) back in.
vectorizer_train = CountVectorizer()
X_train = vectorizer_train.fit_transform(X_train)

In [47]:
# Encode the held-out descriptions with the vectorizer that was fitted on the
# training data. Reusing that object, instead of building a second
# CountVectorizer from its vocabulary, guarantees the same tokenisation
# settings and column order as X_train even if the training vectorizer is
# later given non-default options.
vectorizer_test = vectorizer_train  # name kept for any cell that refers to it
X_test = vectorizer_train.transform(X_test)

In [48]:
# Fit a multinomial Naive Bayes classifier on the training token counts
# (fit() returns the estimator itself, so it can be bound directly).
model = MultinomialNB().fit(X_train, y_train)
# Predict the held-out rows and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

        100       0.92      0.86      0.89        28
       1000       1.00      0.91      0.95        22
        103       0.00      0.00      0.00         9
        106       1.00      0.28      0.43        18
       1089       0.77      0.94      0.85       374
       1121       0.68      0.52      0.59        25
       1122       0.00      0.00      0.00         1
        135       1.00      0.64      0.78        11
       1389       0.86      0.97      0.91       438
        139       0.94      0.94      0.94        16
       1390       0.89      0.55      0.68       240
       1391       0.70      0.86      0.77       480
       1401       0.00      0.00      0.00         3
       1461       0.00      0.00      0.00         2
       1481       1.00      0.16      0.28        31
       1501       0.93      0.78      0.85        50
       1521       0.00      0.00      0.00         1
       1541       0.96      0.91      0.94   

  'precision', 'predicted', average, warn_for)


In [49]:
from sklearn import metrics

In [50]:
# Overall accuracy on the held-out rows (arguments are y_true, then y_pred).
score = metrics.accuracy_score(y_test, y_pred)

In [51]:
print('{0:f}'.format(score))

0.850543


In [52]:
# Hand-written product descriptions used as a quick sanity check of the model.
teste_predict = [
    'EPSON MAGENTO',
    'LILLO BICO',
    'BOV AMERICANA KG',
    'KING 100ML JASMIM',
]
# Encode them with the fitted vectorizer and display the predicted classes.
teste_predict_vect = vectorizer_train.transform(teste_predict)
model.predict(teste_predict_vect)

array(['483', '1683', '1566', '80'],
      dtype='<U4')

In [53]:
# Class probabilities for the sample descriptions, converted from the array
# returned by predict_proba into nested Python lists (one inner list per text).
pred_prob_list = model.predict_proba(teste_predict_vect).tolist()
pred_prob_list

[[0.004065682754442102,
  0.0026051378480263427,
  0.0009130700835331218,
  0.0020518162813951894,
  0.0292016136343961,
  0.00273298955941508,
  0.0003164726878837803,
  1.5125234407844594e-05,
  3.0240595625044615e-05,
  0.00010576269104249873,
  7.553738866425423e-05,
  1.512375309081588e-05,
  0.0008369572881879161,
  4.5347572893292015e-05,
  0.02771628148000182,
  0.002070777331410779,
  0.021669346718347377,
  0.03213060074906678,
  0.0005406778502367678,
  3.024355690546226e-05,
  0.0002864069660911577,
  0.0030630429950008974,
  0.006523214634625787,
  3.024454412781924e-05,
  0.007314167635951705,
  1.5124740603260584e-05,
  4.535793264037111e-05,
  0.0004355178078208728,
  0.002343153845497228,
  0.010713640574626995,
  0.018702978043007308,
  0.0025029727250989405,
  0.0047262513600473035,
  0.0004209092910305792,
  0.0023955358057098264,
  0.0002862858677187126,
  0.005369707418361706,
  0.0064206266960378676,
  0.002649591051831202,
  0.0002262727384379561,
  0.0033604398

In [54]:
# Highest probability the model assigned to any class for the first sample text.
greatest_prob = max(pred_prob_list[0])
greatest_prob

0.1405027262437707

In [55]:
def get_greatest_probabilities(pred_prob_list):
    """Return the largest value found in each inner sequence.

    Args:
        pred_prob_list: An iterable of non-empty sequences of comparable
            values, e.g. one list of class probabilities per sample.

    Returns:
        list: The maximum of each inner sequence, in input order.

    Raises:
        ValueError: If any inner sequence is empty (raised by ``max``).
    """
    return [max(class_probs) for class_probs in pred_prob_list]

In [56]:
get_greatest_probabilities(pred_prob_list)

[0.1405027262437707,
 0.5807494957788668,
 0.22929725890544045,
 0.8590213241011219]

In [57]:
def prediction_script(text):
    """Predict the class of each description in ``text``.

    Relies on the notebook-level ``vectorizer_train`` and ``model`` objects.

    Args:
        text: An iterable of description strings to classify.

    Yields:
        The array returned by ``model.predict`` for the vectorized ``text``.
        It is yielded as a single item, so ``list(prediction_script(...))``
        is a one-element list holding that array.
    """
    # Vectorize the caller's input. The previous version computed this value
    # but then predicted on the notebook-global teste_predict_vect, so the
    # argument was ignored and the same predictions came back for any input.
    predict_vect = vectorizer_train.transform(text)
    yield model.predict(predict_vect)

In [58]:
list(prediction_script(['EPSON MAGENTO','LILLO BICO','BOV AMERICANA KG','KING 100ML JASMIM']))

[array(['483', '1683', '1566', '80'],
       dtype='<U4')]

In [59]:
filename = 'model_v2.pkl'

In [60]:
try:
    # joblib is a standalone package; the copy vendored under
    # sklearn.externals was deprecated and later removed from scikit-learn.
    import joblib
except ImportError:
    # Fallback for older environments that only ship the vendored copy.
    from sklearn.externals import joblib

In [61]:
# Serialize the trained classifier to `filename`; leaving the with-block
# closes the file handle.
with open(filename, 'wb') as file:
    joblib.dump(model, file)

In [62]:
# NOTE(review): likely redundant. When the cells run in order, `file` was
# opened by a with-statement that already closed it on exit, and closing an
# already-closed file has no effect.
file.close()

In [63]:
vocab_filename = 'model_v2_vocabulary.pkl'

In [64]:
# Persist the fitted vocabulary mapping (token -> column index) next to the model.
with open(vocab_filename, 'wb') as file2:
    # Write through the handle opened above. Passing the path here instead
    # made joblib open the same file a second time while file2 was still open
    # for writing, and file2 itself was never written to.
    joblib.dump(vectorizer_train.vocabulary_, file2)

In [65]:
vectorizer_train.vocabulary_

{'20949': 1970,
 '140': 1112,
 'villa': 29800,
 'fritex': 15682,
 'fortsat': 15476,
 'klin': 18471,
 'grade': 16460,
 'nativa': 21529,
 'ah1237': 6179,
 'ducato': 13347,
 'oympikus': 22667,
 'courino': 11636,
 '60t': 4440,
 'ginastica': 16198,
 'biquini': 8423,
 'multifuncionais': 21265,
 'turma': 29029,
 'ultracet': 29117,
 'transilvania': 28713,
 'tiabendazol': 28327,
 'cerca': 10236,
 'fute': 15812,
 'clavulin': 10822,
 'start': 27317,
 'gia': 16172,
 'monozol': 21002,
 'cwc08abana': 12018,
 'arl8832': 7143,
 'determax': 12700,
 'antisseptico': 6911,
 'sk': 26868,
 'trilogia': 28856,
 'seninha': 26495,
 'energy': 13919,
 'endotraqueal': 13898,
 '200': 1832,
 'straik': 27393,
 'gazin': 16018,
 'strepsils': 27402,
 'comet': 11160,
 'colera': 11070,
 'keftron': 18336,
 'spazio': 27184,
 'dourada': 13228,
 'j2000': 17985,
 'cilindrica': 10630,
 'niobium': 21830,
 'metros': 20529,
 'vet': 29707,
 'temporadas': 28120,
 '213l': 2071,
 'grande': 16489,
 'programas': 24430,
 '2137004': 2069,