In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split

#importing dataset
df_train = pd.read_csv(str( 'sample_products.csv'),sep=',')
df_test = pd.read_csv(str( 'test_products.csv'), sep=',')


1o passo Removal of Stop Words
2o passo Tokenization
3o passo Stemming


In [2]:
# concatening title and tags
df_copy = df_train.copy()
df_copy["text"] = df_copy["concatenated_tags"] + " " + df_copy["query"]+ " " + df_copy["title"]
df_copy = df_copy[df_copy["concatenated_tags"].notnull()]

In [3]:
# tokenization
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_copy['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_copy['text']] 
print(df_copy['tokenized_text'].head(10))

0    [mandala, mdf, espirito, santo, mandala, espir...
1    [cartao, visita, panfletos, tag, adesivos, cop...
2    [expositor, expositor, de, esmaltes, organizad...
3    [jogo, lencol, menino, lencol, berco, medidas,...
4    [adesivo, box, banheiro, adesivo, box, banheir...
5    [albuns, figurinhas, pai, lucas, album, fotos,...
6    [mini, arranjos, arranjo, de, flores, para, me...
7    [bb, lembrancinhas, maternidade, baby, lembran...
8    [dia, pais, chaveiro, dia, dos, pais, chaveiro...
9    [nascimento, manta, baby, cha, bebe, vestido, ...
Name: tokenized_text, dtype: object


In [4]:
# Removal of Stop Words
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
df_copy['tokens'] = df_copy['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
print(df_copy['tokens'].head(10))


# tokenization
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_copy['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_copy['tokens']] 
print(df_copy['tokenized_text'].head(10))

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    mandala mdf espirito santo Mandala Espírito Santo
1    cartao visita panfletos tag adesivos copos lon...
2    expositor expositor esmaltes Organizador expos...
3    t jogo lencol menino lencol berco medidas lenc...
4    adesivo box banheiro adesivo box banheiro ADES...
5    albuns figurinhas pai lucas album fotos dia pa...
6    mini arranjos arranjo flores mesa Arranjo Flor...
7    bb lembrancinhas maternidade baby lembranca ma...
8         dia pais chaveiro dia pais chaveiro dia pais
9    nascimento manta baby cha bebe vestido bebe ma...
Name: tokens, dtype: object
0    [mandala, mdf, espirito, santo, mandala, espir...
1    [cartao, visita, panfletos, tag, adesivos, cop...
2    [expositor, expositor, esmaltes, organizador, ...
3    [jogo, lencol, menino, lencol, berco, medidas,...
4    [adesivo, box, banheiro, adesivo, box, banheir...
5    [albuns, figurinhas, pai, lucas, album, fotos,...
6    [mini, arranjos, arranjo, flores, mesa, arranj...
7    [bb, lembrancinhas, maternidade,

In [5]:

# Stemming 
import nltk.stem
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()
# Get the stemmed_tokens
df_copy['stemmed_tokens'] = [[stemmer.stem(word) for word in tokens] for tokens in df_copy['tokenized_text']]
df_copy['stemmed_tokens'].head(10)

[nltk_data] Downloading package rslp to /home/jovyan/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


0    [mandal, mdf, espirit, sant, mandal, espirit, ...
1    [carta, visit, panflet, tag, ades, cop, long, ...
2    [exposi, exposi, esmalt, organiz, exposi, esmalt]
3    [jog, lencol, menin, lencol, berc, med, lencol...
4    [ades, box, banh, ades, box, banh, ades, box, ...
5    [album, figur, pai, luc, album, fot, dia, pal,...
6    [min, arranj, arranj, fl, mes, arranj, fl, orq...
7    [bb, lembranc, matern, baby, lembranc, matern,...
8           [dia, pal, chav, dia, pal, chav, dia, pal]
9    [nasc, mant, baby, cha, beb, vest, beb, mant, ...
Name: stemmed_tokens, dtype: object

In [7]:
# building dictionaries

from gensim import corpora
# Build the dictionary
mydict = corpora.Dictionary(df_copy['stemmed_tokens'])
print("Total unique words:")
print(len(mydict.token2id))
print("\nSample data from dictionary:")
i = 0
# Print top 4 (word, id) tuples
for key in mydict.token2id.keys():
    print("Word: {} - ID: {} ".format(key, mydict.token2id[key]))
    if i == 3:
        break
    i += 1

Total unique words:
6666

Sample data from dictionary:
Word: espirit - ID: 0 
Word: mandal - ID: 1 
Word: mdf - ID: 2 
Word: sant - ID: 3 


In [8]:
#Generating Bow Vectors

import gensim
vocab_len = len(mydict)
print("Example of how the BOW words")
arr = []
for line in df_copy['stemmed_tokens']:
    print("Doc2Bow Line:")
    print(mydict.doc2bow(line))
    for word in line:
        arr.append(mydict.token2id[word])
    print("Actual line:")
    print(line)
    print("(Word, count) Tuples:")
    print([(mydict[id], count) for id, count in mydict.doc2bow(line) ])
    print("Sparse bow vector for the line")
    print(gensim.matutils.corpus2csc([mydict.doc2bow(line)],num_terms=vocab_len).toarray()[:,0])
    break
print("Sorted word id list")
print(sorted(arr))

df_copy = df_copy.fillna(0)

print(df_copy.info())

Example of how the BOW words
Doc2Bow Line:
[(0, 2), (1, 2), (2, 1), (3, 2)]
Actual line:
['mandal', 'mdf', 'espirit', 'sant', 'mandal', 'espirit', 'sant']
(Word, count) Tuples:
[('espirit', 2), ('mandal', 2), ('mdf', 1), ('sant', 2)]
Sparse bow vector for the line
[2. 2. 1. ... 0. 0. 0.]
Sorted word id list
[0, 0, 1, 1, 2, 3, 3]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37998 entries, 0 to 37999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         37998 non-null  int64  
 1   seller_id          37998 non-null  int64  
 2   query              37998 non-null  object 
 3   search_page        37998 non-null  int64  
 4   position           37998 non-null  int64  
 5   title              37998 non-null  object 
 6   concatenated_tags  37998 non-null  object 
 7   creation_date      37998 non-null  object 
 8   price              37998 non-null  float64
 9   weight             37998 no

In [9]:
from sklearn.model_selection import train_test_split
# Train Test Split Function
top_data_df_small = df_copy
def split_train_test(top_data_df_small, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split(top_data_df_small[['product_id', 'seller_id','search_page','position', 'creation_date', 'price','weight','express_delivery','minimum_quantity','view_counts','order_counts', 'stemmed_tokens']], 
                                                        top_data_df_small['category'], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train set")
    print(Y_train.value_counts())
    print("Value counts for Test set")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

# Call the train_test_split
X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small)

Value counts for Train set
Lembrancinhas         12272
Decoração              6075
Bebê                   4861
Papel e Cia            1945
Outros                  785
Bijuterias e Jóias      660
Name: category, dtype: int64
Value counts for Test set
Lembrancinhas         5252
Decoração             2647
Bebê                  2069
Papel e Cia            805
Outros                 347
Bijuterias e Jóias     280
Name: category, dtype: int64
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
   index  product_id  seller_id  search_page  position        creation_date  \
0  29372     5795302    7931459            1         9  2014-01-15 14:59:29   
1   7069     8243708    3398473            1        10  2018-06-06 22:26:07   
2  28585    13717382    6729875            1        30  2019-05-05 12:24:52   
3  36029    14777376    9085143            1        26  2014-02-09 00:41:09   
4  11256     5644691    3645206            1        36  2013-10-15 11:07:37   

    price 

In [11]:
from gensim.models import Word2Vec
import time
# Skip-gram model (sg = 1)
#size = 1000
window = 3
min_count = 1
workers = 3
sg = 1
OUTPUT_FOLDER=''
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_'  + '.model'
start_time = time.time()
stemmed_tokens = pd.Series(top_data_df_small['stemmed_tokens']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, workers = workers, window = window, sg = sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 2.092071294784546


In [18]:
import numpy as np
# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)
# Unique ID of the word
print("Index of the word 'mdf':")
print(sg_w2v_model.wv.key_to_index["mdf"])
# Total number of the words 
print(len(sg_w2v_model.wv))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(sg_w2v_model.wv.get_vecattr("mdf", "count"))
# Get the mean for the vectors for an example review
print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model.wv.get_vecattr(token, "count") for token in top_data_df_small['stemmed_tokens'][0]], axis=0))


Index of the word 'mdf':
42
6666
Length of the vector generated for a word
1653
Print the length after taking average of all word vectors in a sentence:
417.2857142857143


In [19]:
# Store the vectors for train data in following file
OUTPUT_FOLDER =''
word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in X_train.iterrows():
        model_vector = (np.mean([sg_w2v_model.wv.get_vecattr(token, "count") for token in row['stemmed_tokens']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

In [20]:
import time
#Import the DecisionTreeeClassifier
from sklearn.tree import DecisionTreeClassifier
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_df, Y_train['category'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))


Time taken to fit the model with word2vec vectors: 0.0857694149017334


In [22]:
from sklearn.metrics import classification_report
test_features_word2vec = []
for index, row in X_test.iterrows():
    model_vector = np.mean([sg_w2v_model.wv.get_vecattr(token, "count") for token in row['stemmed_tokens']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(classification_report(Y_test['category'],test_predictions_word2vec))


                    precision    recall  f1-score   support

              Bebê       0.00      0.00      0.00      2069
Bijuterias e Jóias       0.00      0.00      0.00       280
         Decoração       0.00      0.00      0.00      2647
     Lembrancinhas       0.46      1.00      0.63      5252
            Outros       0.00      0.00      0.00       347
       Papel e Cia       0.00      0.00      0.00       805

          accuracy                           0.46     11400
         macro avg       0.08      0.17      0.11     11400
      weighted avg       0.21      0.46      0.29     11400



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
