In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std
import texthero as hero
import pandas as pd
import numpy as np
import pickle
import os


# Both locations come from the environment so that no path is hardcoded in the
# notebook: DATASET_PATH is the CSV read by read_clean_dataset and MODEL_PATH is
# the file written by save_model. A missing variable raises KeyError right here.
dataset_path, model_path = (os.environ[name] for name in ('DATASET_PATH', 'MODEL_PATH'))

In [2]:
def read_clean_dataset():
    """Load the CSV dataset and split it into features and labels.

        The file located at ``dataset_path`` is read, the columns listed in
        ``discarded_columns`` are removed and every row that still contains a
        missing value is dropped before the split.

        Returns
        -------
        df_x : DataFrame
            The remaining columns without ``category``, used as training features

        df_y : Series
            The ``category`` column, used as training labels
    """

    discarded_columns = ['product_id', 'seller_id', 'search_page', 'position', 'order_counts',
                         'express_delivery', 'minimum_quantity', 'view_counts', 'creation_date']

    # Rows are dropped only after the columns above are gone, so a missing value
    # in a discarded column does not remove the row.
    cleaned = (
        pd.read_csv(dataset_path)
        .drop(columns=discarded_columns)
        .dropna()
    )

    labels = cleaned['category']
    features = cleaned.drop(columns=['category'])

    return features, labels


In [3]:
def random_search(model_estimator, X_train, y_train, n_iter=1, random_state=None):
    """Run a randomized hyper-parameter search for the RandomForest classifier.

        The candidates are sampled from a fixed grid over the ``clf`` step of the
        pipeline and scored with macro F1 on a shuffled 10-fold cross validation.

        Parameters
        ----------
        model_estimator : Pipeline
            The pipeline to tune; it must contain a step named ``clf``

        X_train : DataFrame
            Data frame with the features for training

        y_train : Series
            The labels for training

        n_iter : int (Default=1)
            Number of parameter settings sampled from the grid. With the default
            only a single random configuration is evaluated, so raise it to get
            an actual search.

        random_state : int or None (Default=None)
            Seed for the parameter sampling; pass an integer to make the sampled
            configurations reproducible.

        Returns
        -------
        RandomizedSearchCV :
            The fitted search object

    """

    random_grid = {
        # Number of trees in random forest: 200, 400, ..., 2000
        'clf__n_estimators': list(range(200, 2001, 200)),
        # Maximum number of levels in tree: 10, 20, ..., 110, plus None (no depth limit)
        'clf__max_depth': list(range(10, 111, 10)) + [None],
        # Minimum number of samples required to split a node
        'clf__min_samples_split': [2, 5, 10],
        # Minimum number of samples required at each leaf node
        'clf__min_samples_leaf': [1, 2, 4],
        # Method of selecting samples for training each tree
        'clf__bootstrap': [True, False],
    }

    cv = KFold(n_splits=10, shuffle=True, random_state=1)
    rf_RandomGrid = RandomizedSearchCV(estimator=model_estimator,
                                       param_distributions=random_grid,
                                       n_iter=n_iter,
                                       scoring='f1_macro',
                                       cv=cv,
                                       verbose=2,
                                       n_jobs=-1,
                                       random_state=random_state)
    rf_RandomGrid.fit(X_train, y_train)

    return rf_RandomGrid

In [4]:
def train_cross_val(model_estimator, X_train, y_train):
    """Fit the estimator on the training data and score it with cross validation.

        Parameters
        ----------
        model_estimator : Pipeline
            The pipeline for train the model with text features

        X_train : DataFrame
            Data frame with the features for training

        y_train : Series
            The labels for training

        Returns
        -------
        model :
            The estimator returned by ``fit`` on the whole training data

        scores :
            The macro F1 score of each of the 10 cross validation folds
    """

    folds = KFold(n_splits=10, shuffle=True, random_state=1)

    fitted_model = model_estimator.fit(X_train, y_train)
    fold_scores = cross_val_score(fitted_model, X_train, y_train,
                                  scoring='f1_macro', cv=folds, n_jobs=-1)

    return fitted_model, fold_scores

In [5]:
def save_model(pipeline):
    """Pickle the pipeline into the file located at ``model_path``.

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to serialize


        Returns
        -------
        None
            ``pickle.dump`` produces no value, so nothing is returned; the
            result of the call is the file written at ``model_path``

    """
    with open(model_path, "wb") as model_file:
        pickle.dump(pipeline, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    return None

In [6]:
def pipeline(random_search = False):
    """Train the RandomForest text classifier on the dataset and save the fitted model.

       The text columns are cleaned with texthero, the data is split 80/20 into
       train and test partitions and a TF-IDF + RandomForest pipeline is trained,
       either through the randomized hyper-parameter search or through a simple
       fit scored with cross validation. The fitted pipeline is finally pickled
       with ``save_model``.

       Parameters
       ----------
       random_search : boolean (Default=False)
           If True, train through the module-level ``random_search`` function and
           print the macro F1 on the test partition; otherwise run a simple fit
           and print the mean and standard deviation of the cross validated F1.

    """

    def make_text_pipe():
        # Bag-of-words counts followed by TF-IDF weighting for one text column
        return Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ])

    df_x, df_y = read_clean_dataset()

    for text_column in ('query', 'title', 'concatenated_tags'):
        df_x[text_column] = df_x[text_column].pipe(hero.clean)

    # Partition data
    X_train, X_test, y_train, y_test = train_test_split(df_x,
                                                        df_y,
                                                        test_size=.2,
                                                        random_state=12345)
    # Define categorical pipelines (only the title one is active, see TODO below)
    cat_title_pipe = make_text_pipe()
    cat_query_pipe = make_text_pipe()
    cat_tags_pipe = make_text_pipe()

    # Define numerical pipeline
    num_pipe = Pipeline([
        ('scaler', MinMaxScaler())
    ])

    #TODO: Using only title as text feature, change comment for use others columns as features
    preprocessor = ColumnTransformer(
                        transformers=[
                            ("cat_title", cat_title_pipe, 'title'),
                            #("num", num_pipe, make_column_selector(dtype_include=np.number)),
                            #("cat_query", cat_query_pipe, 'query'),
                            #("cat_tags", cat_tags_pipe, 'concatenated_tags'),
                        ]
                    )

    # Combine the preprocessing with the classifier
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier())
    ])

    if random_search:
        # The boolean flag has the same name as the module-level random_search()
        # function and hides it in this scope, so calling `random_search(...)`
        # here would call a bool. Fetch the function from the module globals.
        run_random_search = globals()['random_search']
        search = run_random_search(pipe, X_train, y_train)
        # The search is built with scoring='f1_macro', so score() reports macro F1.
        print (f'Test F1 (macro): {search.score(X_test, y_test):.3f}')
        # The search fits clones of `pipe`, leaving `pipe` itself unfitted; the
        # refitted best configuration is the model worth saving.
        best_model = search.best_estimator_
    else:
        best_model, scores = train_cross_val(pipe, X_train, y_train)
        print('F1: %.3f (%.3f)' % (mean(scores), std(scores)))

    save_model(best_model)
            

In [7]:
# Simple fit + cross validation run; the randomized search is left disabled.
pipeline(random_search=False)

F1: 0.848 (0.008)
