In [None]:
# --- Imports -----------------------------------------------------------------
import numpy as np
import pandas as pd

import gensim
from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess

import mlflow
import mlflow.sklearn

import nltk
import nltk.stem

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# --- NLTK resources ----------------------------------------------------------
# 'rslp' backs the RSLPStemmer and 'stopwords' backs the stop-word list, both
# of which are used by later cells.
nltk.download('rslp')
nltk.download('stopwords')

# --- MLflow tracking ---------------------------------------------------------
mlflow.set_tracking_uri("http://mlflow_server:5000")
mlflow.set_experiment("Doc2Vec-Decision-Tree")

# --- Datasets ----------------------------------------------------------------
df_train = pd.read_csv('sample_products.csv', sep=',')
df_test = pd.read_csv('test_products.csv', sep=',')

# Turn on scikit-learn autologging (models and their signatures included).
mlflow.sklearn.autolog(log_models=True, log_model_signatures=True)


Step 1: Removal of stop words
Step 2: Tokenization
Step 3: Stemming


In [None]:
# Build the free-text field used by the rest of the notebook by joining the
# tags, the search query and the product title.
#
# Rows without tags are dropped *before* the new column is added, and an
# explicit copy is taken, so that later column assignments write to an
# independent frame rather than to a filtered slice of df_train (which makes
# pandas emit SettingWithCopyWarning).
df_copy = df_train[df_train["concatenated_tags"].notnull()].copy()

# A missing query or title would turn the whole concatenation into NaN, and a
# NaN "text" value breaks the string processing in the following cells, so
# missing pieces are treated as empty strings.
df_copy["text"] = (
    df_copy["concatenated_tags"]
    + " "
    + df_copy["query"].fillna("")
    + " "
    + df_copy["title"].fillna("")
)

In [None]:
# Tokenization (first pass, on the raw concatenated text)

# Each document becomes a list of tokens stored in the new 'tokenized_text'
# column; deacc=True asks gensim to strip accent marks.
df_copy['tokenized_text'] = df_copy['text'].map(
    lambda raw_text: simple_preprocess(raw_text, deacc=True)
)
print(df_copy['tokenized_text'].head(10))

In [None]:
# Removal of stop words

stopwords = nltk.corpus.stopwords.words('portuguese')
# A set gives constant-time membership tests (the list form was scanned for
# every word of every document).
stopword_set = set(stopwords)

# Drop stop words from the raw text. The comparison is done on the lowercased
# word so that capitalized stop words ("De", "Para", ...) are removed as well;
# the surviving words keep their original casing.
df_copy['tokens'] = df_copy['text'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stopword_set
    )
)
print(df_copy['tokens'].head(10))


# Tokenization

# Re-tokenize the stop-word-free text; this overwrites the 'tokenized_text'
# column produced from the raw text earlier.
df_copy['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_copy['tokens']]
print(df_copy['tokenized_text'].head(10))

In [None]:

# Stemming

# NLTK's RSLP stemmer (its data is fetched by nltk.download('rslp') in the
# first cell).
stemmer = nltk.stem.RSLPStemmer()

# Map every token of every document to its stem, keeping one list per row.
df_copy['stemmed_tokens'] = df_copy['tokenized_text'].map(
    lambda token_list: [stemmer.stem(token) for token in token_list]
)
df_copy['stemmed_tokens'].head(10)

In [None]:
# Building the dictionary

# gensim Dictionary built from the stemmed documents (maps token -> integer id).
mydict = corpora.Dictionary(df_copy['stemmed_tokens'])
print("Total unique words:")
print(len(mydict.token2id))
print("\nSample data from dictionary:")
# Show the first four (word, id) pairs in the mapping's iteration order.
for shown, (token, token_id) in enumerate(mydict.token2id.items()):
    if shown > 3:
        break
    print("Word: {} - ID: {} ".format(token, token_id))

In [None]:
# Generating BoW vectors
vocab_len = len(mydict)
print("Example of how the BOW words")
arr = []
# Only the first document is walked through, purely as an illustration.
for line in df_copy['stemmed_tokens'].head(1):
    bow = mydict.doc2bow(line)
    print("Doc2Bow Line:")
    print(bow)
    # Collect the dictionary id of every token occurrence in the document.
    arr.extend(mydict.token2id[word] for word in line)
    print("Actual line:")
    print(line)
    print("(Word, count) Tuples:")
    print([(mydict[token_id], count) for token_id, count in bow])
    print("Sparse bow vector for the line")
    dense_bow = gensim.matutils.corpus2csc([bow], num_terms=vocab_len).toarray()
    print(dense_bow[:, 0])
print("Sorted word id list")
print(sorted(arr))

# Replace every remaining missing value (in any column) with 0.
df_copy = df_copy.fillna(0)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_copy.info()

# Create one indicator column per category value.
df_one = pd.get_dummies(df_copy.category)
print(df_one.head())

# Drop indicator columns left over from a previous run of this cell before
# concatenating; otherwise re-running the cell duplicates them and
# df_copy[<category>] stops being a single column.
# NOTE(review): assumes no category value is equal to the name of one of the
# original columns - confirm against the data.
df_copy = pd.concat(
    [df_copy.drop(columns=df_one.columns, errors="ignore"), df_one],
    axis=1,
)

In [None]:
# Train/test split helper
top_data_df_small = df_copy

# Columns handed to the split as predictors when the caller does not pass
# its own list.
DEFAULT_FEATURE_COLUMNS = [
    'product_id', 'seller_id', 'search_page', 'position', 'creation_date',
    'price', 'weight', 'express_delivery', 'minimum_quantity', 'view_counts',
    'order_counts', 'stemmed_tokens',
]


def split_train_test(top_data_df_small, category, test_size=0.3, shuffle_state=True,
                     random_state=15, feature_columns=None):
    """Split the frame into train/test predictors and targets for one target column.

    Parameters
    ----------
    top_data_df_small : pandas.DataFrame
        Frame holding both the predictor columns and the target column.
    category : hashable
        Name of the column used as the target.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    shuffle_state : bool, default True
        Forwarded to ``train_test_split`` as ``shuffle``.
    random_state : int, default 15
        Forwarded to ``train_test_split``. Keeping it fixed makes repeated
        calls on the same frame return the same row partition.
    feature_columns : list of str, optional
        Predictor columns to keep; defaults to ``DEFAULT_FEATURE_COLUMNS``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``, all DataFrames whose index has
        been reset; the previous index is kept as a regular column. The
        ``Y_*`` frames hold the target in a column named after ``category``.
    """
    if feature_columns is None:
        feature_columns = DEFAULT_FEATURE_COLUMNS

    X_train, X_test, Y_train, Y_test = train_test_split(
        top_data_df_small[list(feature_columns)],
        top_data_df_small[category],
        shuffle=shuffle_state,
        test_size=test_size,
        random_state=random_state,
    )
    print("Value counts for Train set")
    print(Y_train.value_counts())
    print("Value counts for Test set")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))

    # Renumber the rows from 0; the old index is preserved as a column.
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame().reset_index()
    Y_test = Y_test.to_frame().reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test


# Call the train_test_split
X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small, category='category')

In [None]:


# Each TaggedDocument pairs a row's stemmed tokens with a one-element tag list
# holding the row's position in the frame (the enumerate counter). The tag is
# not the class label.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(top_data_df_small['stemmed_tokens'])]
print(documents[1])

# Train the model on all documents.
# NOTE(review): vector_size is set to the vocabulary size; the CSV header
# written for the training vectors relies on the two being equal, so change
# both together if this is ever reduced.
doc2vec_model = Doc2Vec(documents, vector_size=vocab_len, window=3, min_count=1, workers=4)

# Sample vector for the first document. .iloc[0] selects by position: rows
# were filtered earlier in the notebook, so the index label 0 is not
# guaranteed to exist and a label lookup ([0]) could raise KeyError.
vector = doc2vec_model.infer_vector(top_data_df_small['stemmed_tokens'].iloc[0])
# Printing sample vector
print(len(vector))
print("Top 10 values in Doc2Vec inferred vector:")
print(vector[:10])

In [None]:
# Store the inferred Doc2Vec vectors for the training rows in a CSV file
doc2vec_filename = 'train_review_doc2vec.csv'
with open(doc2vec_filename, 'w') as doc2vec_file:
    # Write the header unconditionally and before any data row. It used to be
    # written only when a row with index 0 was met inside the loop, so an
    # empty X_train produced a file with no header at all.
    header = ",".join(str(ele) for ele in range(vocab_len))
    doc2vec_file.write(header)
    doc2vec_file.write("\n")

    # One CSV line per training document, in X_train row order.
    for stemmed_tokens in X_train['stemmed_tokens']:
        model_vector = doc2vec_model.infer_vector(stemmed_tokens)
        line1 = ",".join(str(vector_element) for vector_element in model_vector)
        doc2vec_file.write(line1)
        doc2vec_file.write('\n')

In [None]:
# Load the training-set vectors written by the previous cell
doc2vec_df = pd.read_csv(doc2vec_filename)

clf_decision_doc2vec = DecisionTreeClassifier()

categories = df_copy['category'].unique().tolist()

mydict.save_as_text("my_dict_Doc2Vec.txt", sort_by_word=True)

# Fit and log one classifier per category. The target for each run is the
# indicator column named after the category.
for category in categories:
    X_train, X_test, Y_train, Y_test = split_train_test(top_data_df_small, category=category)

    # doc2vec_df was computed from the X_train of an earlier split. It lines up
    # with Y_train only because split_train_test uses a fixed random_state and
    # therefore returns the same row partition on every call. Fail early and
    # clearly if the row counts ever diverge.
    if len(doc2vec_df) != len(Y_train):
        raise ValueError(
            "Cached Doc2Vec features ({} rows) do not match the training labels "
            "({} rows) for category {!r}".format(len(doc2vec_df), len(Y_train), category)
        )

    # Category values are not guaranteed to be strings (missing values were
    # replaced by 0 earlier), so convert before building the names.
    category_name = str(category)
    with mlflow.start_run(run_name='Doc2Vec_categorizer_' + category_name) as run:
        clf_decision_doc2vec.fit(doc2vec_df, Y_train[category])
        mlflow.sklearn.log_model(
            sk_model=clf_decision_doc2vec,
            artifact_path="sklearn-model",
            registered_model_name="Doc2Vec-DecisionTreeClass-" + category_name
        )
        mlflow.log_artifact("my_dict_Doc2Vec.txt")