In [1]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
import nltk
import time
import nltk.stem
import gensim

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim import corpora
from sklearn.model_selection import train_test_split




nltk.download('rslp')
nltk.download('stopwords')

mlflow.set_tracking_uri("http://mlflow_server:5000")
mlflow.set_experiment("Bag-Of-Words-Decision-Tree")

#importing dataset
df_train = pd.read_csv(str( 'sample_products.csv'),sep=',')
df_test = pd.read_csv(str( 'test_products.csv'), sep=',')

mlflow.sklearn.autolog(log_models=True,log_model_signatures=True)

[nltk_data] Downloading package rslp to /home/jovyan/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022/03/07 11:41:31 INFO mlflow.tracking.fluent: Experiment with name 'Bag-Of-Words-Decision-Tree' does not exist. Creating a new experiment.


In [2]:
# concatening title and tags
df_copy = df_train.copy()
df_copy["text"] = df_copy["concatenated_tags"] + " " + df_copy["query"]+ " " + df_copy["title"]
df_copy = df_copy[df_copy["concatenated_tags"].notnull()]

In [3]:
# tokenization

# Tokenize the text column to get the new column 'tokenized_text'
df_copy['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_copy['text']] 
print(df_copy['tokenized_text'].head(10))

0    [mandala, mdf, espirito, santo, mandala, espir...
1    [cartao, visita, panfletos, tag, adesivos, cop...
2    [expositor, expositor, de, esmaltes, organizad...
3    [jogo, lencol, menino, lencol, berco, medidas,...
4    [adesivo, box, banheiro, adesivo, box, banheir...
5    [albuns, figurinhas, pai, lucas, album, fotos,...
6    [mini, arranjos, arranjo, de, flores, para, me...
7    [bb, lembrancinhas, maternidade, baby, lembran...
8    [dia, pais, chaveiro, dia, dos, pais, chaveiro...
9    [nascimento, manta, baby, cha, bebe, vestido, ...
Name: tokenized_text, dtype: object


In [4]:
# Removal of Stop Words


stopwords = nltk.corpus.stopwords.words('portuguese')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df_copy['tokens'] = df_copy['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
print(df_copy['tokens'].head(10))


# tokenization

# Tokenize the text column to get the new column 'tokenized_text'
df_copy['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_copy['tokens']] 
print(df_copy['tokenized_text'].head(10))

0    mandala mdf espirito santo Mandala Espírito Santo
1    cartao visita panfletos tag adesivos copos lon...
2    expositor expositor esmaltes Organizador expos...
3    t jogo lencol menino lencol berco medidas lenc...
4    adesivo box banheiro adesivo box banheiro ADES...
5    albuns figurinhas pai lucas album fotos dia pa...
6    mini arranjos arranjo flores mesa Arranjo Flor...
7    bb lembrancinhas maternidade baby lembranca ma...
8         dia pais chaveiro dia pais chaveiro dia pais
9    nascimento manta baby cha bebe vestido bebe ma...
Name: tokens, dtype: object
0    [mandala, mdf, espirito, santo, mandala, espir...
1    [cartao, visita, panfletos, tag, adesivos, cop...
2    [expositor, expositor, esmaltes, organizador, ...
3    [jogo, lencol, menino, lencol, berco, medidas,...
4    [adesivo, box, banheiro, adesivo, box, banheir...
5    [albuns, figurinhas, pai, lucas, album, fotos,...
6    [mini, arranjos, arranjo, flores, mesa, arranj...
7    [bb, lembrancinhas, maternidade,

In [5]:

# Stemming 
stemmer = nltk.stem.RSLPStemmer()

# Get the stemmed_tokens
df_copy['stemmed_tokens'] = [[stemmer.stem(word) for word in tokens] for tokens in df_copy['tokenized_text']]
df_copy['stemmed_tokens'].head(10)

0    [mandal, mdf, espirit, sant, mandal, espirit, ...
1    [carta, visit, panflet, tag, ades, cop, long, ...
2    [exposi, exposi, esmalt, organiz, exposi, esmalt]
3    [jog, lencol, menin, lencol, berc, med, lencol...
4    [ades, box, banh, ades, box, banh, ades, box, ...
5    [album, figur, pai, luc, album, fot, dia, pal,...
6    [min, arranj, arranj, fl, mes, arranj, fl, orq...
7    [bb, lembranc, matern, baby, lembranc, matern,...
8           [dia, pal, chav, dia, pal, chav, dia, pal]
9    [nasc, mant, baby, cha, beb, vest, beb, mant, ...
Name: stemmed_tokens, dtype: object

In [6]:
# Build the dictionary
mydict = corpora.Dictionary(df_copy['stemmed_tokens'])
print("Total unique words:")
print(len(mydict.token2id))
print("\nSample data from dictionary:")
i = 0
# Print top 4 (word, id) tuples
for key in mydict.token2id.keys():
    print("Word: {} - ID: {} ".format(key, mydict.token2id[key]))
    if i == 3:
        break
    i += 1

Total unique words:
6666

Sample data from dictionary:
Word: espirit - ID: 0 
Word: mandal - ID: 1 
Word: mdf - ID: 2 
Word: sant - ID: 3 


In [7]:
#Generating Bow Vectors
vocab_len = len(mydict)
print("Example of how the BOW words")
arr = []
for line in df_copy['stemmed_tokens']:
    print("Doc2Bow Line:")
    print(mydict.doc2bow(line))
    for word in line:
        arr.append(mydict.token2id[word])
    print("Actual line:")
    print(line)
    print("(Word, count) Tuples:")
    print([(mydict[id], count) for id, count in mydict.doc2bow(line) ])
    print("Sparse bow vector for the line")
    print(gensim.matutils.corpus2csc([mydict.doc2bow(line)],num_terms=vocab_len).toarray()[:,0])
    break
print("Sorted word id list")
print(sorted(arr))

df_copy = df_copy.fillna(0)

print(df_copy.info())

Example of how the BOW words
Doc2Bow Line:
[(0, 2), (1, 2), (2, 1), (3, 2)]
Actual line:
['mandal', 'mdf', 'espirit', 'sant', 'mandal', 'espirit', 'sant']
(Word, count) Tuples:
[('espirit', 2), ('mandal', 2), ('mdf', 1), ('sant', 2)]
Sparse bow vector for the line
[2. 2. 1. ... 0. 0. 0.]
Sorted word id list
[0, 0, 1, 1, 2, 3, 3]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37998 entries, 0 to 37999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         37998 non-null  int64  
 1   seller_id          37998 non-null  int64  
 2   query              37998 non-null  object 
 3   search_page        37998 non-null  int64  
 4   position           37998 non-null  int64  
 5   title              37998 non-null  object 
 6   concatenated_tags  37998 non-null  object 
 7   creation_date      37998 non-null  object 
 8   price              37998 non-null  float64
 9   weight             37998 no

In [8]:
#Create column for each category
df_one = pd.get_dummies(df_copy.category)
print(df_one.head())
df_copy = pd.concat([df_copy, df_one], axis=1)

   Bebê  Bijuterias e Jóias  Decoração  Lembrancinhas  Outros  Papel e Cia
0     0                   0          1              0       0            0
1     0                   0          0              0       0            1
2     0                   0          0              0       1            0
3     1                   0          0              0       0            0
4     0                   0          1              0       0            0


In [9]:

# Train Test Split Function
top_data_df_small = df_copy
def split_train_test(top_data_df_small, column, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split(top_data_df_small[['product_id', 'seller_id','search_page','position', 'creation_date', 'price','weight','express_delivery','minimum_quantity','view_counts','order_counts', 'stemmed_tokens']], 
                                                        top_data_df_small[column], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train set")
    print(Y_train.value_counts())
    print("Value counts for Test set")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small,column='category')

Value counts for Train set
Lembrancinhas         12272
Decoração              6075
Bebê                   4861
Papel e Cia            1945
Outros                  785
Bijuterias e Jóias      660
Name: category, dtype: int64
Value counts for Test set
Lembrancinhas         5252
Decoração             2647
Bebê                  2069
Papel e Cia            805
Outros                 347
Bijuterias e Jóias     280
Name: category, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price 

In [10]:

start_time = time.time()
vocab_len = len(mydict)
bow_filename = 'train_review_bow.csv'
with open(bow_filename, 'w+') as bow_file:
    for index, row in X_train.iterrows():
        features = gensim.matutils.corpus2csc([mydict.doc2bow(row['stemmed_tokens'])],num_terms=vocab_len).toarray()[:,0]
        if index == 0:
            print("Header")
            header = ",".join(str(mydict[ele]) for ele in range(vocab_len))
            print(header)
            bow_file.write(header)
            bow_file.write("\n")
        line1 = ",".join( [str(vector_element) for vector_element in features] )
        bow_file.write(line1)
        bow_file.write('\n')

print("Time taken to create bow for :" + str(time.time() - start_time))

Header
espirit,mandal,mdf,sant,ades,canec,carta,cop,drink,long,panflet,tag,visit,esmalt,exposi,organiz,americ,berc,estamp,jog,lencol,med,menin,banh,box,de,album,dia,figur,fot,luc,pai,pal,arranj,fl,mes,min,orquid,aromariz,baby,bb,beb,confort,kit,lembranc,matern,sacol,chav,cha,mant,nasc,nom,nuv,person,vest,art,corda,crach,gratil,port,chinel,sandal,carn,cas,panel,tabu,cri,decoraca,espelh,gavet,movel,mud,predilet,azul,casori,guardanap,aniversari,ano,batiz,dec,emm,fest,folh,fst,fund,ingl,junin,locaca,mar,mur,niv,noiv,painel,thomad,yasmin,bat,liz,brind,corpor,escol,pres,feliz,divers,garden,itanha,nich,pec,henriqu,luil,toalh,berloqu,cois,nan,par,uso,financ,plann,croch,sof,difu,princip,urs,caix,bigod,cru,retrat,cheg,color,fort,miut,papel,tod,vera,acessori,enxov,sai,almof,aninh,din,dinorex,dinossaur,gael,joa,joaquim,raf,unidad,bem,capach,class,cria,divert,mickey,tapet,tutu,vind,apliqu,difer,namor,appl,com,espec,pass,passar,pap,quadr,ana,consagraca,convit,crism,lia,padr,boc,frald,mao,col,joi,nat

Time taken to create bow for :66.44623374938965


In [11]:

# Initialize the classifier object
# Fit the model with input vectors and corresponding sentiment labels
bow_clf = DecisionTreeClassifier(random_state=0)
bow_df = pd.read_csv('train_review_bow.csv')
categories = df_copy['category'].unique().tolist()

In [12]:

for category in categories:
    X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small,column=category)
    # Train the classifier with default parameters
    with mlflow.start_run(run_name='BOW_categorizer_'+category) as run:
        bow_clf.fit(bow_df, Y_train[category])
        #print("Logged data and model in run {}".format(run.info.run_id))
        mlflow.sklearn.log_model(
            sk_model=bow_clf,
            artifact_path="sklearn-model",
            registered_model_name="bow-DecisionTreeClass-"+category
        )

Value counts for Train set
0    20523
1     6075
Name: Decoração, dtype: int64
Value counts for Test set
0    8753
1    2647
Name: Decoração, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70   
3   67.42     0.

Successfully registered model 'bow-DecisionTreeClass-Decoração'.
2022/03/07 11:44:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Decoração, version 1
Created version '1' of model 'bow-DecisionTreeClass-Decoração'.


Value counts for Train set
0    24653
1     1945
Name: Papel e Cia, dtype: int64
Value counts for Test set
0    10595
1      805
Name: Papel e Cia, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70   
3   67.42 

Successfully registered model 'bow-DecisionTreeClass-Papel e Cia'.
2022/03/07 11:45:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Papel e Cia, version 1
Created version '1' of model 'bow-DecisionTreeClass-Papel e Cia'.


Value counts for Train set
0    25813
1      785
Name: Outros, dtype: int64
Value counts for Test set
0    11053
1      347
Name: Outros, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70   
3   67.42     0.0   

Successfully registered model 'bow-DecisionTreeClass-Outros'.
2022/03/07 11:46:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Outros, version 1
Created version '1' of model 'bow-DecisionTreeClass-Outros'.


Value counts for Train set
0    21737
1     4861
Name: Bebê, dtype: int64
Value counts for Test set
0    9331
1    2069
Name: Bebê, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70   
3   67.42     0.0         

Successfully registered model 'bow-DecisionTreeClass-Bebê'.
2022/03/07 11:47:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Bebê, version 1
Created version '1' of model 'bow-DecisionTreeClass-Bebê'.


Value counts for Train set
0    14326
1    12272
Name: Lembrancinhas, dtype: int64
Value counts for Test set
0    6148
1    5252
Name: Lembrancinhas, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70   
3   67.4

Successfully registered model 'bow-DecisionTreeClass-Lembrancinhas'.
2022/03/07 11:48:40 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Lembrancinhas, version 1
Created version '1' of model 'bow-DecisionTreeClass-Lembrancinhas'.


Value counts for Train set
0    25938
1      660
Name: Bijuterias e Jóias, dtype: int64
Value counts for Test set
0    11120
1      280
Name: Bijuterias e Jóias, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price  weight  express_delivery  minimum_quantity  view_counts  \
0   14.14    45.0                 1                36          567   
1   56.80   705.0                 1                 8          167   
2   18.93     6.0                 1                31           70

Successfully registered model 'bow-DecisionTreeClass-Bijuterias e Jóias'.
2022/03/07 11:49:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: bow-DecisionTreeClass-Bijuterias e Jóias, version 1
Created version '1' of model 'bow-DecisionTreeClass-Bijuterias e Jóias'.
