In [None]:
# --- Imports ---------------------------------------------------------------
import numpy as np
import pandas as pd

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# --- MLflow configuration ----------------------------------------------------
# Select the tracking server and the experiment used by the runs started
# later in this notebook.
mlflow.set_tracking_uri("http://mlflow_server:5000")
mlflow.set_experiment("Bag-Of-Words-Decision-Tree")

# --- Data ----------------------------------------------------------------------
# Comma-separated files, resolved relative to the working directory.
df_train = pd.read_csv('sample_products.csv', sep=',')
df_test = pd.read_csv('test_products.csv', sep=',')

# Enable scikit-learn autologging, including fitted models and their signatures.
mlflow.sklearn.autolog(log_models=True, log_model_signatures=True)

Step 1: Removal of stop words
Step 2: Tokenization
Step 3: Stemming


In [None]:
# Build the free-text field "<tags> <query> <title>" for rows that have tags.
#
# The rows are filtered first and the result is copied explicitly, so that
# df_copy is an independent frame: later cells add columns to it, and adding
# columns to a boolean-mask slice can raise pandas' SettingWithCopyWarning.
# Filtering first also avoids concatenating text for rows that are dropped.
df_copy = df_train[df_train["concatenated_tags"].notnull()].copy()
df_copy["text"] = (
    df_copy["concatenated_tags"] + " " + df_copy["query"] + " " + df_copy["title"]
)

In [None]:
# Tokenization (first pass, on the raw concatenated text)
from gensim.utils import simple_preprocess

# One token list per row, stored in 'tokenized_text'; deacc=True is forwarded
# to gensim's simple_preprocess.
# Note: the stop-word cell that follows recomputes this column from the
# filtered text.
df_copy['tokenized_text'] = df_copy['text'].map(
    lambda raw_text: simple_preprocess(raw_text, deacc=True)
)
print(df_copy['tokenized_text'].head(10))

In [None]:
# Removal of stop words, followed by tokenization of the filtered text.
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
import nltk

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# A frozenset gives constant-time membership tests instead of scanning the
# list once per word.
stopword_set = frozenset(stopwords)

# Words are compared in lower case: the text has not been lower-cased yet at
# this point, so a case-sensitive comparison would let capitalised stop words
# through. The surviving words keep their original spelling.
df_copy['tokens'] = df_copy['text'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stopword_set
    )
)
print(df_copy['tokens'].head(10))

# Tokenize the stop-word-free text; this replaces any earlier 'tokenized_text'.
df_copy['tokenized_text'] = [
    simple_preprocess(line, deacc=True) for line in df_copy['tokens']
]
print(df_copy['tokenized_text'].head(10))

In [None]:

# Stemming
import nltk.stem

# The RSLP stemmer needs its NLTK resource to be available locally.
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

# Stem every token of every row; the result is one list of stems per row.
df_copy['stemmed_tokens'] = df_copy['tokenized_text'].map(
    lambda row_tokens: [stemmer.stem(token) for token in row_tokens]
)
df_copy['stemmed_tokens'].head(10)

In [None]:
# Building the vocabulary
from gensim import corpora

# Build a gensim Dictionary from the stemmed token lists.
mydict = corpora.Dictionary(df_copy['stemmed_tokens'])

print("Total unique words:")
print(len(mydict.token2id))

print("\nSample data from dictionary:")
# Preview at most four (word, id) pairs, in the mapping's own iteration order.
for _, (word, word_id) in zip(range(4), mydict.token2id.items()):
    print("Word: {} - ID: {} ".format(word, word_id))

In [None]:
# Generating BoW vectors: worked example on the first document, then
# missing-value handling.
import gensim

vocab_len = len(mydict)
print("Example of how the BOW words")
arr = []
# head(1) limits the walkthrough to the first document only.
for line in df_copy['stemmed_tokens'].head(1):
    # doc2bow is computed once and reused by every print below.
    bow = mydict.doc2bow(line)
    print("Doc2Bow Line:")
    print(bow)
    arr.extend(mydict.token2id[word] for word in line)
    print("Actual line:")
    print(line)
    print("(Word, count) Tuples:")
    # 'token_id' rather than 'id', so the builtin id() is not shadowed.
    print([(mydict[token_id], count) for token_id, count in bow])
    print("Sparse bow vector for the line")
    print(gensim.matutils.corpus2csc([bow], num_terms=vocab_len).toarray()[:, 0])
print("Sorted word id list")
print(sorted(arr))

# Replace every missing value in the frame with 0.
df_copy = df_copy.fillna(0)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
df_copy.info()

In [None]:
# One indicator column per distinct value of 'category'.
df_one = pd.get_dummies(df_copy["category"])
print(df_one.head())

# Place the indicator columns next to the existing ones.
# Gotcha: re-running this cell appends the indicator columns again.
df_copy = pd.concat(objs=[df_copy, df_one], axis="columns")

In [None]:
from sklearn.model_selection import train_test_split

# Frame that every split in this notebook is drawn from.
top_data_df_small = df_copy


def split_train_test(top_data_df_small, column, test_size=0.3, shuffle_state=True):
    """Split the frame into train/test features and a single target column.

    Parameters
    ----------
    top_data_df_small : pandas.DataFrame
        Source frame; must contain the feature columns listed in the body
        as well as ``column``.
    column : str
        Name of the column used as the target.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    shuffle_state : bool, default True
        Forwarded to ``train_test_split`` as ``shuffle``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` as DataFrames. Each has had
        ``reset_index()`` applied, so the previous row labels are kept as a
        column and the new index is positional.

    Notes
    -----
    ``random_state`` is fixed at 15, so repeated calls on the same frame with
    the same ``test_size``/``shuffle_state`` use the same seed. Class counts
    and a preview of ``X_train`` are printed as a side effect.
    """
    feature_columns = [
        'product_id', 'seller_id', 'search_page', 'position', 'creation_date',
        'price', 'weight', 'express_delivery', 'minimum_quantity',
        'view_counts', 'order_counts', 'stemmed_tokens',
    ]
    features = top_data_df_small[feature_columns]
    target = top_data_df_small[column]

    X_train, X_test, Y_train, Y_test = train_test_split(
        features,
        target,
        shuffle=shuffle_state,
        test_size=test_size,
        random_state=15,
    )

    print("Value counts for Train set")
    print(Y_train.value_counts())
    print("Value counts for Test set")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))

    # Switch to positional indices; targets become one-column frames first.
    X_train, X_test = X_train.reset_index(), X_test.reset_index()
    Y_train = Y_train.to_frame().reset_index()
    Y_test = Y_test.to_frame().reset_index()

    print(X_train.head())
    return X_train, X_test, Y_train, Y_test


X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small, column='category')

In [None]:
# Write the dense bag-of-words matrix of the training set to a CSV file:
# one header line with the vocabulary words, then one line of counts per row.
import time

OUTPUT_FOLDER = ''
start_time = time.time()
vocab_len = len(mydict)
bow_filename = OUTPUT_FOLDER + 'train_review_bow.csv'

# The encoding is pinned to UTF-8 so the file does not depend on the platform
# default and matches what pandas.read_csv assumes when it is read back.
with open(bow_filename, 'w', encoding='utf-8') as bow_file:
    # The header is written once, before any row, instead of when a row
    # labelled 0 is seen: the file then always has a header, even when
    # X_train is empty or its index does not start at 0.
    print("Header")
    header = ",".join(str(mydict[ele]) for ele in range(vocab_len))
    print(header)
    bow_file.write(header)
    bow_file.write("\n")

    # Only the token lists are needed, so iterate the column directly rather
    # than building a Series per row with iterrows().
    for tokens in X_train['stemmed_tokens']:
        features = gensim.matutils.corpus2csc(
            [mydict.doc2bow(tokens)], num_terms=vocab_len
        ).toarray()[:, 0]
        bow_file.write(",".join(str(vector_element) for vector_element in features))
        bow_file.write('\n')

print("Time taken to create bow for :" + str(time.time() - start_time))

In [None]:

# Inputs for the per-category training loop:
#   bow_df     - bag-of-words features read back from the CSV written earlier
#   categories - distinct values of 'category', in order of first appearance
#   bow_clf    - decision tree with a fixed random_state
bow_csv_path = OUTPUT_FOLDER + 'train_review_bow.csv'
bow_df = pd.read_csv(bow_csv_path)
categories = df_copy['category'].drop_duplicates().tolist()
bow_clf = DecisionTreeClassifier(random_state=0)

In [None]:

# Train and register one binary decision tree per category, each in its own
# MLflow run.
for category in categories:
    # Re-split with the category's indicator column as the target.
    # split_train_test uses a fixed random_state, so this split is expected to
    # select the same training rows as the one bow_df was built from.
    X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small, column=category)
    with mlflow.start_run(run_name='BOW_categorizer_' + category) as run:
        # Time only the fit itself, so the figure reported below is not
        # inflated by run creation, model upload and registration.
        start_time = time.time()
        bow_clf.fit(bow_df, Y_train[category])
        fit_seconds = time.time() - start_time

        mlflow.sklearn.log_model(
            sk_model=bow_clf,
            artifact_path="sklearn-model",
            registered_model_name="bow-DecisionTreeClass-" + category
        )
        # Reported only after log_model has returned, so the message is never
        # printed for a model that failed to log.
        print("Logged data and model in run {}".format(run.info.run_id))
    print("Time taken to fit the model: " + str(fit_seconds))

